library(tidyverse) # for everything
library(readxl) # for reading in excel files
library(janitor) # data checks and cleaning
library(glue) # for easy pasting
library(FactoMineR) # for PCA
library(factoextra) # for PCA
library(rstatix) # for stats
library(pheatmap) # for heatmaps
library(plotly) # for interactive plots
library(htmlwidgets) # for saving interactive plots
library(devtools)
library(notame) # used for feature clustering
library(doParallel)
library(igraph) # feature clustering
library(ggpubr) # visualizations
library(knitr) # clean table printing
library(mixOmics) # for multilevel PCAs# raw filtered metabolomics data in C18 (+)
omicsdata <- read_csv("Feature lists/C18Pos-Filtered-Data-05Jun23_946features.csv")
# metadata
metadata <- read_excel("Metadata-urine-c18pos.xlsx")metadata <- metadata %>%
rename("sample_ID" = Sample_ID)# rename "row ID"
omicsdata <- omicsdata %>%
rename("row_ID" = `row ID`)
# how many features
nrow(omicsdata)## [1] 946
# are there any duplicates?
omicsdata %>% get_dupes(mz_rt)## # A tibble: 2 × 60
## mz_rt dupe_count row_ID `6101_U4_C18POS_14` `6110_U2_C18POS_34`
## <chr> <int> <dbl> <dbl> <dbl>
## 1 369.152_4.502 2 4830 40321. 3464.
## 2 369.152_4.502 2 4832 40321. 3464.
## # ℹ 55 more variables: `6104_U2_C18POS_19` <dbl>, `6109_U4_C18POS_22` <dbl>,
## # `6111_U1_C18POS_51` <dbl>, `6110_U3_C18POS_45` <dbl>,
## # `6105_U1_C18POS_43` <dbl>, `6111_U4_C18POS_42` <dbl>,
## # `6113_U2_C18POS_31` <dbl>, `6105_U2_C18POS_40` <dbl>,
## # `6109_U1_C18POS_58` <dbl>, `6113_U4_C18POS_62` <dbl>,
## # `6102_U1_C18POS_26_1` <dbl>, `6102_U2_C18POS_16` <dbl>,
## # `6105_U4_C18POS_47` <dbl>, `6105_U3_C18POS_39` <dbl>, …
Remove duplicates
# remove dupes
omicsdata <- omicsdata %>%
distinct(mz_rt, .keep_all = TRUE)
# check again for dupes
omicsdata %>% get_dupes(mz_rt)## # A tibble: 0 × 60
## # ℹ 60 variables: mz_rt <chr>, dupe_count <int>, row_ID <dbl>,
## # 6101_U4_C18POS_14 <dbl>, 6110_U2_C18POS_34 <dbl>, 6104_U2_C18POS_19 <dbl>,
## # 6109_U4_C18POS_22 <dbl>, 6111_U1_C18POS_51 <dbl>, 6110_U3_C18POS_45 <dbl>,
## # 6105_U1_C18POS_43 <dbl>, 6111_U4_C18POS_42 <dbl>, 6113_U2_C18POS_31 <dbl>,
## # 6105_U2_C18POS_40 <dbl>, 6109_U1_C18POS_58 <dbl>, 6113_U4_C18POS_62 <dbl>,
## # 6102_U1_C18POS_26_1 <dbl>, 6102_U2_C18POS_16 <dbl>,
## # 6105_U4_C18POS_47 <dbl>, 6105_U3_C18POS_39 <dbl>, …
# how many features
nrow(omicsdata)## [1] 945
Remove weird empty column
colnames(omicsdata)## [1] "mz_rt"
## [2] "row_ID"
## [3] "6101_U4_C18POS_14"
## [4] "6110_U2_C18POS_34"
## [5] "6104_U2_C18POS_19"
## [6] "6109_U4_C18POS_22"
## [7] "6111_U1_C18POS_51"
## [8] "6110_U3_C18POS_45"
## [9] "6105_U1_C18POS_43"
## [10] "6111_U4_C18POS_42"
## [11] "6113_U2_C18POS_31"
## [12] "6105_U2_C18POS_40"
## [13] "6109_U1_C18POS_58"
## [14] "6113_U4_C18POS_62"
## [15] "6102_U1_C18POS_26_1"
## [16] "6102_U2_C18POS_16"
## [17] "6105_U4_C18POS_47"
## [18] "6105_U3_C18POS_39"
## [19] "6111_U2_C18POS_35"
## [20] "6103_U4_C18POS_53"
## [21] "6102_U3_C18POS_48"
## [22] "6102_U4_C18POS_50"
## [23] "6104_U4_C18POS_12"
## [24] "6101_U1_C18POS_59"
## [25] "6103_U3_C18POS_61"
## [26] "6108_U2_C18POS_27_1"
## [27] "6103_U2_C18POS_60"
## [28] "6101_U2_C18POS_30"
## [29] "6112_U2_C18POS_37"
## [30] "6113_U1_C18POS_24"
## [31] "6108_U4_C18POS_18"
## [32] "6104_U1_C18POS_55"
## [33] "6113_U3_C18POS_15"
## [34] "6101_U3_C18POS_29_1"
## [35] "6106_U3_C18POS_20"
## [36] "6106_U2_C18POS_46"
## [37] "6108_U1_C18POS_44"
## [38] "6111_U3_C18POS_54"
## [39] "6112_U4_C18POS_63"
## [40] "6109_U3_C18POS_52"
## [41] "6110_U1_C18POS_11"
## [42] "6110_U4_C18POS_start-10"
## [43] "6112_U3_C18POS_56"
## [44] "6108_U3_C18POS_28_1"
## [45] "6103_U1_C18POS_21"
## [46] "6106_U1_C18POS_23"
## [47] "6109_U2_C18POS_13"
## [48] "6106_U4_C18POS_36"
## [49] "6112_U1_C18POS_38"
## [50] "PooledQC6_C18POS_17"
## [51] "PooledQC11_C18POS_57"
## [52] "PooledQC9_C18POS_41"
## [53] "6104_U3_C18POS_32"
## [54] "PooledQC10_C18POS_49"
## [55] "PooledQC12_C18POS_64"
## [56] "PrerunPooledQC5tryagain-addednew3_C18POS_14_1"
## [57] "PooledQC8_C18POS_33"
## [58] "PooledQC7_C18POS_25"
## [59] "...59"
# remove weird lgl column
omicsdata <- omicsdata %>%
dplyr::select(!where(is.logical))
colnames(omicsdata)## [1] "mz_rt"
## [2] "row_ID"
## [3] "6101_U4_C18POS_14"
## [4] "6110_U2_C18POS_34"
## [5] "6104_U2_C18POS_19"
## [6] "6109_U4_C18POS_22"
## [7] "6111_U1_C18POS_51"
## [8] "6110_U3_C18POS_45"
## [9] "6105_U1_C18POS_43"
## [10] "6111_U4_C18POS_42"
## [11] "6113_U2_C18POS_31"
## [12] "6105_U2_C18POS_40"
## [13] "6109_U1_C18POS_58"
## [14] "6113_U4_C18POS_62"
## [15] "6102_U1_C18POS_26_1"
## [16] "6102_U2_C18POS_16"
## [17] "6105_U4_C18POS_47"
## [18] "6105_U3_C18POS_39"
## [19] "6111_U2_C18POS_35"
## [20] "6103_U4_C18POS_53"
## [21] "6102_U3_C18POS_48"
## [22] "6102_U4_C18POS_50"
## [23] "6104_U4_C18POS_12"
## [24] "6101_U1_C18POS_59"
## [25] "6103_U3_C18POS_61"
## [26] "6108_U2_C18POS_27_1"
## [27] "6103_U2_C18POS_60"
## [28] "6101_U2_C18POS_30"
## [29] "6112_U2_C18POS_37"
## [30] "6113_U1_C18POS_24"
## [31] "6108_U4_C18POS_18"
## [32] "6104_U1_C18POS_55"
## [33] "6113_U3_C18POS_15"
## [34] "6101_U3_C18POS_29_1"
## [35] "6106_U3_C18POS_20"
## [36] "6106_U2_C18POS_46"
## [37] "6108_U1_C18POS_44"
## [38] "6111_U3_C18POS_54"
## [39] "6112_U4_C18POS_63"
## [40] "6109_U3_C18POS_52"
## [41] "6110_U1_C18POS_11"
## [42] "6110_U4_C18POS_start-10"
## [43] "6112_U3_C18POS_56"
## [44] "6108_U3_C18POS_28_1"
## [45] "6103_U1_C18POS_21"
## [46] "6106_U1_C18POS_23"
## [47] "6109_U2_C18POS_13"
## [48] "6106_U4_C18POS_36"
## [49] "6112_U1_C18POS_38"
## [50] "PooledQC6_C18POS_17"
## [51] "PooledQC11_C18POS_57"
## [52] "PooledQC9_C18POS_41"
## [53] "6104_U3_C18POS_32"
## [54] "PooledQC10_C18POS_49"
## [55] "PooledQC12_C18POS_64"
## [56] "PrerunPooledQC5tryagain-addednew3_C18POS_14_1"
## [57] "PooledQC8_C18POS_33"
## [58] "PooledQC7_C18POS_25"
# create long df for omics df
omicsdata_tidy <- omicsdata %>%
pivot_longer(cols = 3:ncol(.),
names_to = "sample_ID",
values_to = "peak_height")
# combine meta and omics dfs
df_combined <- full_join(omicsdata_tidy,
metadata,
by = c("sample_ID" = "sample_ID"))
# separate mz and rt
df_combined_sep <- df_combined %>%
separate(col = mz_rt,
into = c("mz", "rt"),
sep = "_")
# convert columns to correct type
df_combined_sep$mz <- as.numeric(df_combined_sep$mz)
df_combined_sep$rt <- as.numeric(df_combined_sep$rt)
df_combined_sep$Subject <- as.character(df_combined_sep$Subject)
df_combined_sep$Intervention <- as.character(df_combined_sep$Intervention)
# rearrange column order
df_combined_sep <- df_combined_sep %>%
dplyr::select(sample_ID, pre_post, Intervention, everything())
str(df_combined_sep)## tibble [52,920 × 14] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:52920] "6101_U4_C18POS_14" "6110_U2_C18POS_34" "6104_U2_C18POS_19" "6109_U4_C18POS_22" ...
## $ pre_post : chr [1:52920] "post" "post" "post" "post" ...
## $ Intervention : chr [1:52920] "Yellow" "Yellow" "Yellow" "Red" ...
## $ mz : num [1:52920] 227 227 227 227 227 ...
## $ rt : num [1:52920] 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 ...
## $ row_ID : num [1:52920] 37 37 37 37 37 37 37 37 37 37 ...
## $ peak_height : num [1:52920] 82834 159688 140461 47134 83790 ...
## $ Subject : chr [1:52920] "6101" "6110" "6104" "6109" ...
## $ Period : chr [1:52920] "U4" "U2" "U2" "U4" ...
## $ sequence : chr [1:52920] "R_Y" "Y_R" "Y_R" "Y_R" ...
## $ Intervention_week: num [1:52920] 14 6 6 14 2 10 2 14 6 6 ...
## $ Sex : chr [1:52920] "F" "M" "M" "F" ...
## $ Age : num [1:52920] 58 36 54 75 46 36 40 46 61 40 ...
## $ BMI : num [1:52920] 31.1 29.9 33.1 43.3 30 ...
# replace NA's in subject and intervention columns with QC
df_combined_sep$Subject <- df_combined_sep$Subject %>%
replace_na("QC")
df_combined_sep$Intervention <- df_combined_sep$Intervention %>%
replace_na("QC")nrow(omicsdata)## [1] 945
range(df_combined_sep$mz)## [1] 61.0395 1106.5217
range(df_combined_sep$rt)## [1] 0.553 10.707
# plot
(plot_mzvsrt <- df_combined_sep %>%
ggplot(aes(x = rt, y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "Retention time, min",
y = "m/z, neutral",
title = "mz across RT for all features"))df_combined_sep %>%
ggplot(aes(x = mz)) +
geom_histogram(binwidth = 25) +
theme_minimal() +
labs(x = "Monoisotopic mass (amu)",
y = "Number of features",
title = "Distribution of features by mass")df_combined_sep %>%
ggplot(aes(x = rt)) +
geom_histogram(binwidth = 0.1) + # 6 second bins
theme_minimal() +
labs(x = "Retention time",
y = "Number of features",
title = "Distribution of features by retention time")# NAs in all data including QCs
NAbyRow <- rowSums(is.na(omicsdata[,-1]))
hist(NAbyRow,
breaks = 56, # because there are 56 samples, 48 samples + 8 QCs
xlab = "Number of missing values",
ylab = "Number of metabolites",
main = "How many missing values are there?")# samples only (no QCs)
omicsdata_noQC <- omicsdata %>%
dplyr::select(-contains("QC"))
#NAs in samples only?
NAbyRow_noQC <- rowSums(is.na(omicsdata_noQC[,-1]))
hist(NAbyRow_noQC,
breaks = 48, # because there are 48 samples
xlab = "Number of missing values",
ylab = "Number of metabolites",
main = "How many missing values are there?")Are there any missing values in QCs? There shouldn’t be after data preprocessing/filtering
omicsdata_QC <- omicsdata %>%
dplyr::select(starts_with("P"))
NAbyRow_QC <- colSums(is.na(omicsdata_QC))
# lets confirm that there are no missing values from my QCs
sum(NAbyRow_QC) # no## [1] 0
# calculate how many NAs there are per feature in whole data set
contains_NAs <- df_combined %>%
group_by(mz_rt) %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(contains_NAs)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 44
## 2 106.0133_6.888 TRUE 36
## 3 1071.4352_3.059 TRUE 44
## 4 1106.5217_3.833 TRUE 44
## 5 119.0485_0.793 TRUE 36
## 6 119.0492_3.019 TRUE 21
NAs by groups
#calculate NAs per feature in red intervention
NAs_Red_Intervention <- df_combined %>%
group_by(mz_rt) %>%
filter(Intervention == "Red") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_Red_Intervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 15
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 19
## 6 119.0492_3.019 TRUE 12
#calculate NAs per feature in yellow intervention
NAs_Yellow_Intervention <- df_combined %>%
group_by(mz_rt) %>%
filter(Intervention == "Yellow") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_Yellow_Intervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 21
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 17
## 6 119.0492_3.019 TRUE 9
#calculate NAs per feature in before both interventions
NAs_preIntervention <- df_combined %>%
group_by(mz_rt) %>%
filter(pre_post == "pre") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_preIntervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 17
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 18
## 6 119.0492_3.019 TRUE 10
#calculate NAs per feature after both interventions
NAs_postIntervention <- df_combined %>%
group_by(mz_rt) %>%
filter(pre_post == "post") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_postIntervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 19
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 18
## 6 119.0492_3.019 TRUE 11
To try and handle outliers, I came up with filtering out metabolites that are only present in one person. i.e. remove metabolites that are missing from at least 44 samples. I am taking this bit out for now as it did not change anything
# remove features that have 44 or more NAs
omit_features <- contains_NAs %>%
filter(n >= 44)
#preview
nrow(omit_features) # features to remove
# how many features to remove?
nrow(omicsdata) - nrow(omit_features)
# now remove these features from the omics dataset
omicsdata <- omicsdata %>%
anti_join(omit_features,
by = "mz_rt")
# how many features are there now?
nrow(omicsdata)# impute any missing values by replacing them with 1/2 of the lowest peak height value of a feature (i.e. in a row).
imputed_omicsdata <- omicsdata
imputed_omicsdata[] <- lapply(imputed_omicsdata,
function(x) ifelse(is.na(x),
min(x, na.rm = TRUE)/2, x))
dim(imputed_omicsdata)## [1] 945 58
Are there any NAs?
imputed_omicsdata %>%
is.na() %>%
sum()## [1] 0
# imputations worked# create long df for imputed omics df
imputed_omicsdata_tidy <- imputed_omicsdata %>%
pivot_longer(cols = 3:ncol(.),
names_to = "sample_ID",
values_to = "peak_height")
# combine meta and imputed omics dfs
imputed_fulldata <- full_join(imputed_omicsdata_tidy,
metadata,
by = c("sample_ID" = "sample_ID"))
# separate mz and rt
imputed_fulldata_sep <- imputed_fulldata %>%
separate(col = mz_rt,
into = c("mz", "rt"),
sep = "_")
# convert columns to correct type
imputed_fulldata_sep$mz <- as.numeric(imputed_fulldata_sep$mz)
imputed_fulldata_sep$rt <- as.numeric(imputed_fulldata_sep$rt)
imputed_fulldata_sep$Subject <- as.character(imputed_fulldata_sep$Subject)
imputed_fulldata_sep$Intervention <- as.character(imputed_fulldata_sep$Intervention)# rt vs mz plot
imputed_fulldata_sep %>%
ggplot(aes(x = rt, y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "RT (min)",
y = "mz")
# Notame feature reduction vignette for reference
#browseVignettes("notame")# create features list from imputed data set to only include unique feature ID's (mz_rt), mz and RT
features <- imputed_fulldata_sep %>%
cbind(imputed_fulldata$mz_rt) %>%
rename("mz_rt" = "imputed_fulldata$mz_rt") %>%
dplyr::select(c(mz_rt, mz, rt)) %>%
distinct() # remove the duplicate rows
# create a second data frame which is just imputed_fulldata restructured to another wide format
data_notame <- data.frame(imputed_omicsdata %>%
dplyr::select(-row_ID) %>%
t())
data_notame <- data_notame %>%
tibble::rownames_to_column() %>% # change samples from rownames to its own column
row_to_names(row_number = 1) # change the feature IDs (mz_rt) from first row obs into column namesCheck structures
# check if mz and rt are numeric
str(features)## 'data.frame': 945 obs. of 3 variables:
## $ mz_rt: chr "226.9516_0.553" "159.1492_0.608" "175.1442_0.616" "189.1684_0.616" ...
## $ mz : num 227 159 175 189 189 ...
## $ rt : num 0.553 0.608 0.616 0.616 0.615 0.621 0.62 0.633 0.635 0.636 ...
tibble(features)## # A tibble: 945 × 3
## mz_rt mz rt
## <chr> <dbl> <dbl>
## 1 226.9516_0.553 227. 0.553
## 2 159.1492_0.608 159. 0.608
## 3 175.1442_0.616 175. 0.616
## 4 189.1684_0.616 189. 0.616
## 5 189.16_0.615 189. 0.615
## 6 156.0769_0.621 156. 0.621
## 7 170.0926_0.62 170. 0.62
## 8 136.0482_0.633 136. 0.633
## 9 151.1443_0.635 151. 0.635
## 10 137.071_0.636 137. 0.636
## # ℹ 935 more rows
# check if results are numeric
tibble(data_notame)## # A tibble: 56 × 946
## mz_rt `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <chr> <chr> <chr> <chr>
## 1 6101_U4_… " 82834.1800" " 40852.7970" " 13575.6550" " 19074.8930"
## 2 6110_U2_… " 159688.0300" " 17948.1500" " 21984.4790" " 17190.7050"
## 3 6104_U2_… " 140460.9000" " 32255.4550" " 14964.8470" " 20831.3890"
## 4 6109_U4_… " 47134.4200" " 63559.5400" " 52516.5600" " 24691.2460"
## 5 6111_U1_… " 83789.7700" " 131795.4400" " 47572.5400" " 31355.4630"
## 6 6110_U3_… " 115715.8700" " 71032.5500" " 14294.8420" " 27822.7420"
## 7 6105_U1_… " 141117.8600" " 72057.3500" " 44426.5080" " 28435.2440"
## 8 6111_U4_… " 103462.300" " 120798.030" " 50446.316" " 33316.652"
## 9 6113_U2_… " 121278.8000" " 92756.7900" " 37672.3800" " 34728.8440"
## 10 6105_U2_… " 92647.2340" " 98266.6800" " 55319.7970" " 26269.3400"
## # ℹ 46 more rows
## # ℹ 941 more variables: `189.16_0.615` <chr>, `156.0769_0.621` <chr>,
## # `170.0926_0.62` <chr>, `136.0482_0.633` <chr>, `151.1443_0.635` <chr>,
## # `137.071_0.636` <chr>, `182.0574_0.654` <chr>, `162.1126_0.642` <chr>,
## # `76.0757_0.642` <chr>, `114.0669_0.645` <chr>, `227.1255_0.647` <chr>,
## # `193.1547_0.646` <chr>, `219.1705_0.65` <chr>, `163.1243_0.654` <chr>,
## # `213.1234_0.672` <chr>, `203.1502_0.654` <chr>, `138.0551_0.654` <chr>, …
# change to results to numeric
data_notame <- data_notame %>%
mutate_at(-1, as.numeric)
tibble(data_notame)## # A tibble: 56 × 946
## mz_rt `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6101_U4_… 82834. 40853. 13576. 19075.
## 2 6110_U2_… 159688. 17948. 21984. 17191.
## 3 6104_U2_… 140461. 32255. 14965. 20831.
## 4 6109_U4_… 47134. 63560. 52517. 24691.
## 5 6111_U1_… 83790. 131795. 47573. 31355.
## 6 6110_U3_… 115716. 71033. 14295. 27823.
## 7 6105_U1_… 141118. 72057. 44427. 28435.
## 8 6111_U4_… 103462. 120798. 50446. 33317.
## 9 6113_U2_… 121279. 92757. 37672. 34729.
## 10 6105_U2_… 92647. 98267. 55320. 26269.
## # ℹ 46 more rows
## # ℹ 941 more variables: `189.16_0.615` <dbl>, `156.0769_0.621` <dbl>,
## # `170.0926_0.62` <dbl>, `136.0482_0.633` <dbl>, `151.1443_0.635` <dbl>,
## # `137.071_0.636` <dbl>, `182.0574_0.654` <dbl>, `162.1126_0.642` <dbl>,
## # `76.0757_0.642` <dbl>, `114.0669_0.645` <dbl>, `227.1255_0.647` <dbl>,
## # `193.1547_0.646` <dbl>, `219.1705_0.65` <dbl>, `163.1243_0.654` <dbl>,
## # `213.1234_0.672` <dbl>, `203.1502_0.654` <dbl>, `138.0551_0.654` <dbl>, …
connection <- find_connections(data = data_notame,
features = features,
corr_thresh = 0.9,
rt_window = 1/60,
name_col = "mz_rt",
mz_col = "mz",
rt_col = "rt")## [1] 100
## [1] 200
## [1] 300
## [1] 400
## [1] 500
## [1] 600
## [1] 700
## [1] 800
## [1] 900
head(connection)## x y cor rt_diff mz_diff
## 1 151.1443_0.635 76.0757_0.642 0.9772910 0.007 -75.0686
## 2 182.0574_0.654 287.1967_0.655 0.9865474 0.001 105.1393
## 3 114.0669_0.645 227.1255_0.647 0.9689492 0.002 113.0586
## 4 219.1705_0.65 145.1054_0.656 0.9099522 0.006 -74.0651
## 5 144.1023_0.656 145.1054_0.656 0.9856422 0.000 1.0031
## 6 343.1668_0.698 258.1241_0.69 0.9555601 -0.008 -85.0427
clusters <- find_clusters(connections = connection, d_thresh = 0.8)## 113 components found
##
## Component 100 / 113
## 37 components found
##
## 12 components found
##
## 9 components found
##
## 2 components found
##
## 1 components found
# assign a cluster ID to all features. Clusters are named after feature with highest median peak height
features_clustered <- assign_cluster_id(data_notame, clusters, features, name_col = "mz_rt")
# export clustered feature list
write_csv(features_clustered,
"features_notame-clusters_c18-pos.csv")
# visualize clusters
#visualize_clusters(data_notame, features, clusters, min_size = 3, rt_window = 2,name_col = "mz_rt", mz_col = "mz", rt_col = "rt", file_path = "~/path/to/project/")
# lets see how many features are removed when we only keep one feature per cluster
pulled <- pull_clusters(data_notame, features_clustered, name_col = "mz_rt")
cluster_data <- pulled$cdata
cluster_features <- pulled$cfeatures
nrow(omicsdata) - nrow(cluster_features)## [1] 317
# transpose the full dataset back to wide so that it is more similar to the notame dataset
imputed_fulldata_wide <- imputed_fulldata %>%
dplyr::select(-"row_ID") %>%
pivot_wider(names_from = mz_rt,
values_from = peak_height)
# list of reduced features
clusternames <- cluster_features$mz_rt
#dplyr:: only the features are in the reduced list
imp_clust <- imputed_fulldata_wide[,c(names(imputed_fulldata_wide) %in% clusternames)]
# bind back sample names
imp_clust <- cbind(imputed_fulldata_wide[1], imp_clust)
tibble(imp_clust)## # A tibble: 56 × 629
## sample_ID `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6101_U4_… 82834. 40853. 13576. 19075.
## 2 6110_U2_… 159688. 17948. 21984. 17191.
## 3 6104_U2_… 140461. 32255. 14965. 20831.
## 4 6109_U4_… 47134. 63560. 52517. 24691.
## 5 6111_U1_… 83790. 131795. 47573. 31355.
## 6 6110_U3_… 115716. 71033. 14295. 27823.
## 7 6105_U1_… 141118. 72057. 44427. 28435.
## 8 6111_U4_… 103462. 120798. 50446. 33317.
## 9 6113_U2_… 121279. 92757. 37672. 34729.
## 10 6105_U2_… 92647. 98267. 55320. 26269.
## # ℹ 46 more rows
## # ℹ 624 more variables: `189.16_0.615` <dbl>, `156.0769_0.621` <dbl>,
## # `170.0926_0.62` <dbl>, `136.0482_0.633` <dbl>, `137.071_0.636` <dbl>,
## # `162.1126_0.642` <dbl>, `76.0757_0.642` <dbl>, `114.0669_0.645` <dbl>,
## # `193.1547_0.646` <dbl>, `219.1705_0.65` <dbl>, `163.1243_0.654` <dbl>,
## # `213.1234_0.672` <dbl>, `203.1502_0.654` <dbl>, `138.0551_0.654` <dbl>,
## # `146.0812_0.71` <dbl>, `141.0659_0.655` <dbl>, `138.0639_0.655` <dbl>, …
# plot new rt vs mz scatterplot post-clustering
(plot_mzvsrt_postcluster <- cluster_features %>%
ggplot(aes(x = rt,
y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "Retention time, min",
y = "m/z, neutral",
title = "mz across RT for all features after clustering"))# plot both scatterplots to compare with and without notame clustering
(scatterplots <- ggarrange(plot_mzvsrt,
plot_mzvsrt_postcluster,
nrow = 2))imp_metabind_clust <- right_join(metadata,
imp_clust,
by = "sample_ID")# change meta data columns to character so that I can change NAs from QCs to "QC"
imp_metabind_clust <- imp_metabind_clust %>%
mutate_at(c("Subject",
"Period",
"Intervention",
"pre_post",
"sequence",
"Intervention_week",
"Sex",
"Age",
"BMI"),
as.character)
# replace NAs in metadata columns for QCs
imp_metabind_clust[is.na(imp_metabind_clust)] <- "QC"
# unite pre_post column with intervention column to create pre_intervention column
imp_metabind_clust <- imp_metabind_clust %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)
# long df
imp_metabind_clust_tidy <- imp_metabind_clust %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund")
# structure check
str(imp_metabind_clust_tidy)## tibble [35,168 × 13] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:35168] "6101_U1_C18POS_59" "6101_U1_C18POS_59" "6101_U1_C18POS_59" "6101_U1_C18POS_59" ...
## $ Subject : chr [1:35168] "6101" "6101" "6101" "6101" ...
## $ Period : chr [1:35168] "U1" "U1" "U1" "U1" ...
## $ pre_post_intervention: chr [1:35168] "pre_Red" "pre_Red" "pre_Red" "pre_Red" ...
## $ Intervention : chr [1:35168] "Red" "Red" "Red" "Red" ...
## $ pre_post : chr [1:35168] "pre" "pre" "pre" "pre" ...
## $ sequence : chr [1:35168] "R_Y" "R_Y" "R_Y" "R_Y" ...
## $ Intervention_week : chr [1:35168] "2" "2" "2" "2" ...
## $ Sex : chr [1:35168] "F" "F" "F" "F" ...
## $ Age : chr [1:35168] "58" "58" "58" "58" ...
## $ BMI : chr [1:35168] "31.0556029483653" "31.0556029483653" "31.0556029483653" "31.0556029483653" ...
## $ mz_rt : chr [1:35168] "226.9516_0.553" "159.1492_0.608" "175.1442_0.616" "189.1684_0.616" ...
## $ rel_abund : num [1:35168] 108697 81884 16201 28592 407452 ...
imp_metabind_clust_tidy %>%
ggplot(aes(x = sample_ID, y = rel_abund, color = Intervention)) +
geom_boxplot(alpha = 0.6) +
scale_color_manual(values = c("light grey", "tomato1", "gold")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "LC-MS (+) Feature Abundances by Sample",
subtitle = "Unscaled data",
y = "Relative abundance")
Will need to log transform in order to normalize and actually see the
data
imp_metabind_clust_tidy_log2 <- imp_metabind_clust_tidy %>%
mutate(rel_abund_log2 = if_else(rel_abund > 0, log2(rel_abund), 0)) %>%
replace(is.na(.), 0)(bp_data_quality <- imp_metabind_clust_tidy_log2 %>%
ggplot(aes(x = sample_ID, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(alpha = 0.6) +
scale_fill_manual(values = c("light grey", "tomato1", "gold")) +
theme_minimal() +
labs(title = "LC-MS (+) Feature Abundances by Sample",
subtitle = "Log2 transformed data",
y = "Relative abundance"))# go back to wide data
imp_metabind_clust_log2 <- imp_metabind_clust_tidy_log2 %>%
dplyr::select(!rel_abund) %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2)
PCA.imp_metabind_clust_log2 <- PCA(imp_metabind_clust_log2, # wide data
quali.sup = 1:11, # remove qualitative variables
graph = FALSE, # don't graph
scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
kable(summary(PCA.imp_metabind_clust_log2))##
## Call:
## PCA(X = imp_metabind_clust_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 687.364 250.920 104.382 71.249 61.539 50.975 42.072
## % of var. 42.931 15.672 6.519 4.450 3.844 3.184 2.628
## Cumulative % of var. 42.931 58.602 65.122 69.572 73.415 76.599 79.227
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 34.828 28.318 25.591 23.383 21.327 16.986 16.335
## % of var. 2.175 1.769 1.598 1.460 1.332 1.061 1.020
## Cumulative % of var. 81.402 83.171 84.769 86.229 87.561 88.622 89.643
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 12.322 11.045 10.482 9.477 9.063 8.504 7.583
## % of var. 0.770 0.690 0.655 0.592 0.566 0.531 0.474
## Cumulative % of var. 90.412 91.102 91.757 92.349 92.915 93.446 93.919
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 6.793 6.653 6.429 5.918 5.274 5.055 4.770
## % of var. 0.424 0.416 0.402 0.370 0.329 0.316 0.298
## Cumulative % of var. 94.344 94.759 95.161 95.530 95.860 96.175 96.473
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.436 4.236 4.044 3.786 3.638 3.397 3.069
## % of var. 0.277 0.265 0.253 0.236 0.227 0.212 0.192
## Cumulative % of var. 96.750 97.015 97.267 97.504 97.731 97.943 98.135
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 3.045 2.782 2.584 2.496 2.408 2.368 2.261
## % of var. 0.190 0.174 0.161 0.156 0.150 0.148 0.141
## Cumulative % of var. 98.325 98.499 98.660 98.816 98.967 99.114 99.256
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47 Dim.48 Dim.49
## Variance 2.129 1.882 1.751 1.663 1.503 1.442 0.402
## % of var. 0.133 0.118 0.109 0.104 0.094 0.090 0.025
## Cumulative % of var. 99.389 99.506 99.616 99.719 99.813 99.903 99.929
## Dim.50 Dim.51 Dim.52 Dim.53 Dim.54 Dim.55
## Variance 0.237 0.225 0.190 0.179 0.163 0.151
## % of var. 0.015 0.014 0.012 0.011 0.010 0.009
## Cumulative % of var. 99.943 99.957 99.969 99.980 99.991 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr
## 1 | 29.543 | -20.223 1.062 0.469 | -4.852 0.168
## 2 | 27.838 | -20.615 1.104 0.548 | -5.321 0.201
## 3 | 30.889 | -10.493 0.286 0.115 | -1.654 0.019
## 4 | 29.112 | -19.648 1.003 0.455 | -2.692 0.052
## 5 | 29.423 | -17.451 0.791 0.352 | -3.876 0.107
## 6 | 65.387 | 48.424 6.092 0.548 | -34.715 8.577
## 7 | 32.353 | -17.561 0.801 0.295 | -2.877 0.059
## 8 | 26.968 | -14.112 0.517 0.274 | -3.846 0.105
## 9 | 25.315 | -14.146 0.520 0.312 | -3.675 0.096
## 10 | 31.196 | -13.152 0.449 0.178 | -0.992 0.007
## cos2 Dim.3 ctr cos2
## 1 0.027 | -2.760 0.130 0.009 |
## 2 0.037 | -4.760 0.388 0.029 |
## 3 0.003 | -0.502 0.004 0.000 |
## 4 0.009 | -2.100 0.075 0.005 |
## 5 0.017 | -3.026 0.157 0.011 |
## 6 0.282 | -15.715 4.225 0.058 |
## 7 0.008 | -2.110 0.076 0.004 |
## 8 0.020 | -3.340 0.191 0.015 |
## 9 0.021 | 3.286 0.185 0.017 |
## 10 0.001 | -7.107 0.864 0.052 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 226.9516_0.553 | -0.058 0.000 0.013 | 0.108 0.005 0.045 | 0.034
## 159.1492_0.608 | -0.521 0.039 0.329 | -0.300 0.036 0.109 | -0.080
## 175.1442_0.616 | 0.037 0.000 0.002 | 0.343 0.047 0.161 | -0.191
## 189.1684_0.616 | -0.051 0.000 0.007 | 0.005 0.000 0.000 | 0.145
## 189.16_0.615 | -0.017 0.000 0.001 | -0.017 0.000 0.001 | -0.027
## 156.0769_0.621 | -0.011 0.000 0.000 | -0.073 0.002 0.007 | -0.027
## 170.0926_0.62 | -0.104 0.002 0.012 | -0.051 0.001 0.003 | 0.287
## 136.0482_0.633 | 0.013 0.000 0.000 | -0.027 0.000 0.002 | 0.058
## 137.071_0.636 | 0.038 0.000 0.005 | 0.147 0.009 0.079 | 0.004
## 162.1126_0.642 | 0.104 0.002 0.009 | -0.220 0.019 0.042 | 0.264
## ctr cos2
## 226.9516_0.553 0.001 0.005 |
## 159.1492_0.608 0.006 0.008 |
## 175.1442_0.616 0.035 0.050 |
## 189.1684_0.616 0.020 0.056 |
## 189.16_0.615 0.001 0.003 |
## 156.0769_0.621 0.001 0.001 |
## 170.0926_0.62 0.079 0.089 |
## 136.0482_0.633 0.003 0.008 |
## 137.071_0.636 0.000 0.000 |
## 162.1126_0.642 0.067 0.061 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2
## 6101_U1_C18POS_59 | 29.543 | -20.223 0.469 -0.771 | -4.852 0.027
## 6101_U2_C18POS_30 | 33.869 | -16.205 0.229 -0.618 | -3.338 0.010
## 6101_U3_C18POS_29_1 | 28.373 | -19.816 0.488 -0.756 | -4.805 0.029
## 6101_U4_C18POS_14 | 29.664 | -20.017 0.455 -0.764 | -4.285 0.021
## 6102_U1_C18POS_26_1 | 27.838 | -20.615 0.548 -0.786 | -5.321 0.037
## 6102_U2_C18POS_16 | 27.168 | -16.119 0.352 -0.615 | -2.654 0.010
## 6102_U3_C18POS_48 | 35.904 | -13.148 0.134 -0.501 | 0.764 0.000
## 6102_U4_C18POS_50 | 26.805 | -15.784 0.347 -0.602 | -2.070 0.006
## 6103_U1_C18POS_21 | 30.889 | -10.493 0.115 -0.400 | -1.654 0.003
## 6103_U2_C18POS_60 | 33.580 | -14.019 0.174 -0.535 | -1.157 0.001
## v.test Dim.3 cos2 v.test
## 6101_U1_C18POS_59 -0.306 | -2.760 0.009 -0.270 |
## 6101_U2_C18POS_30 -0.211 | 8.185 0.058 0.801 |
## 6101_U3_C18POS_29_1 -0.303 | -2.388 0.007 -0.234 |
## 6101_U4_C18POS_14 -0.270 | -2.225 0.006 -0.218 |
## 6102_U1_C18POS_26_1 -0.336 | -4.760 0.029 -0.466 |
## 6102_U2_C18POS_16 -0.168 | 6.951 0.065 0.680 |
## 6102_U3_C18POS_48 0.048 | 2.064 0.003 0.202 |
## 6102_U4_C18POS_50 -0.131 | -1.432 0.003 -0.140 |
## 6103_U1_C18POS_21 -0.104 | -0.502 0.000 -0.049 |
## 6103_U2_C18POS_60 -0.073 | 9.210 0.075 0.901 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_C18POS_59 | | | 29.543 | | | -20.223 | 0.469 | -0.771 | | | -4.852 | 0.027 | -0.306 | | | -2.760 | 0.009 | -0.270 | | |
| 6101_U2_C18POS_30 | | | 33.869 | | | -16.205 | 0.229 | -0.618 | | | -3.338 | 0.010 | -0.211 | | | 8.185 | 0.058 | 0.801 | | |
| 6101_U3_C18POS_29_1 | | | 28.373 | | | -19.816 | 0.488 | -0.756 | | | -4.805 | 0.029 | -0.303 | | | -2.388 | 0.007 | -0.234 | | |
| 6101_U4_C18POS_14 | | | 29.664 | | | -20.017 | 0.455 | -0.764 | | | -4.285 | 0.021 | -0.270 | | | -2.225 | 0.006 | -0.218 | | |
| 6102_U1_C18POS_26_1 | | | 27.838 | | | -20.615 | 0.548 | -0.786 | | | -5.321 | 0.037 | -0.336 | | | -4.760 | 0.029 | -0.466 | | |
| 6102_U2_C18POS_16 | | | 27.168 | | | -16.119 | 0.352 | -0.615 | | | -2.654 | 0.010 | -0.168 | | | 6.951 | 0.065 | 0.680 | | |
| 6102_U3_C18POS_48 | | | 35.904 | | | -13.148 | 0.134 | -0.501 | | | 0.764 | 0.000 | 0.048 | | | 2.064 | 0.003 | 0.202 | | |
| 6102_U4_C18POS_50 | | | 26.805 | | | -15.784 | 0.347 | -0.602 | | | -2.070 | 0.006 | -0.131 | | | -1.432 | 0.003 | -0.140 | | |
| 6103_U1_C18POS_21 | | | 30.889 | | | -10.493 | 0.115 | -0.400 | | | -1.654 | 0.003 | -0.104 | | | -0.502 | 0.000 | -0.049 | | |
| 6103_U2_C18POS_60 | | | 33.580 | | | -14.019 | 0.174 | -0.535 | | | -1.157 | 0.001 | -0.073 | | | 9.210 | 0.075 | 0.901 | | |
# pull PC coordinates into df
PC_coord_QC_log2 <- as.data.frame(PCA.imp_metabind_clust_log2$ind$coord)
# bind back metadata from cols 1-10
PC_coord_QC_log2 <- bind_cols(imp_metabind_clust_log2[,1:11], PC_coord_QC_log2)
# grab some variance explained
importance_QC <- PCA.imp_metabind_clust_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_withQC <- round(importance_QC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_withQC <- round(importance_QC[2,2], 2)Using FactoExtra package
# scree plot
fviz_eig(PCA.imp_metabind_clust_log2)# get eigenvalues
kable(get_eig(PCA.imp_metabind_clust_log2))| eigenvalue | variance.percent | cumulative.variance.percent | |
|---|---|---|---|
| Dim.1 | 687.3635857 | 42.9307312 | 42.93073 |
| Dim.2 | 250.9195244 | 15.6717040 | 58.60244 |
| Dim.3 | 104.3818151 | 6.5193847 | 65.12182 |
| Dim.4 | 71.2486921 | 4.4499862 | 69.57181 |
| Dim.5 | 61.5386328 | 3.8435241 | 73.41533 |
| Dim.6 | 50.9750889 | 3.1837559 | 76.59909 |
| Dim.7 | 42.0715518 | 2.6276668 | 79.22675 |
| Dim.8 | 34.8276028 | 2.1752308 | 81.40198 |
| Dim.9 | 28.3178189 | 1.7686486 | 83.17063 |
| Dim.10 | 25.5906472 | 1.5983174 | 84.76895 |
| Dim.11 | 23.3828923 | 1.4604275 | 86.22938 |
| Dim.12 | 21.3271567 | 1.3320322 | 87.56141 |
| Dim.13 | 16.9859662 | 1.0608941 | 88.62230 |
| Dim.14 | 16.3349815 | 1.0202355 | 89.64254 |
| Dim.15 | 12.3221601 | 0.7696063 | 90.41215 |
| Dim.16 | 11.0447746 | 0.6898245 | 91.10197 |
| Dim.17 | 10.4819068 | 0.6546694 | 91.75664 |
| Dim.18 | 9.4767804 | 0.5918922 | 92.34853 |
| Dim.19 | 9.0632267 | 0.5660628 | 92.91459 |
| Dim.20 | 8.5037996 | 0.5311226 | 93.44572 |
| Dim.21 | 7.5825944 | 0.4735868 | 93.91930 |
| Dim.22 | 6.7925969 | 0.4242459 | 94.34355 |
| Dim.23 | 6.6533321 | 0.4155478 | 94.75910 |
| Dim.24 | 6.4287724 | 0.4015224 | 95.16062 |
| Dim.25 | 5.9182169 | 0.3696346 | 95.53025 |
| Dim.26 | 5.2737752 | 0.3293847 | 95.85964 |
| Dim.27 | 5.0549188 | 0.3157155 | 96.17535 |
| Dim.28 | 4.7699607 | 0.2979179 | 96.47327 |
| Dim.29 | 4.4363479 | 0.2770814 | 96.75035 |
| Dim.30 | 4.2356834 | 0.2645485 | 97.01490 |
| Dim.31 | 4.0435758 | 0.2525500 | 97.26745 |
| Dim.32 | 3.7859289 | 0.2364581 | 97.50391 |
| Dim.33 | 3.6380108 | 0.2272196 | 97.73113 |
| Dim.34 | 3.3967746 | 0.2121527 | 97.94328 |
| Dim.35 | 3.0686817 | 0.1916609 | 98.13494 |
| Dim.36 | 3.0453242 | 0.1902021 | 98.32515 |
| Dim.37 | 2.7820573 | 0.1737592 | 98.49890 |
| Dim.38 | 2.5840331 | 0.1613912 | 98.66030 |
| Dim.39 | 2.4960434 | 0.1558956 | 98.81619 |
| Dim.40 | 2.4077761 | 0.1503827 | 98.96657 |
| Dim.41 | 2.3684058 | 0.1479237 | 99.11450 |
| Dim.42 | 2.2614422 | 0.1412431 | 99.25574 |
| Dim.43 | 2.1292283 | 0.1329854 | 99.38873 |
| Dim.44 | 1.8818286 | 0.1175335 | 99.50626 |
| Dim.45 | 1.7514593 | 0.1093911 | 99.61565 |
| Dim.46 | 1.6626426 | 0.1038438 | 99.71949 |
| Dim.47 | 1.5033597 | 0.0938955 | 99.81339 |
| Dim.48 | 1.4422161 | 0.0900766 | 99.90347 |
| Dim.49 | 0.4016522 | 0.0250860 | 99.92855 |
| Dim.50 | 0.2367709 | 0.0147880 | 99.94334 |
| Dim.51 | 0.2246164 | 0.0140289 | 99.95737 |
| Dim.52 | 0.1898383 | 0.0118567 | 99.96923 |
| Dim.53 | 0.1785895 | 0.0111542 | 99.98038 |
| Dim.54 | 0.1633172 | 0.0102003 | 99.99058 |
| Dim.55 | 0.1508037 | 0.0094188 | 100.00000 |
# scores plot
fviz_pca_ind(PCA.imp_metabind_clust_log2)# loadings
fviz_pca_var(PCA.imp_metabind_clust_log2)
### Manual scores plots
# manual scores plot
(PCA_withQCs <- PC_coord_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_withQC/PC1_withQC) +
labs(x = glue::glue("PC1: {PC1_withQC}%"),
y = glue::glue("PC2: {PC2_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data"))imp_metabind_clust_log2_noQCs <- imp_metabind_clust_log2 %>%
filter(Intervention != "QC")
PCA.imp_metabind_clust_log2_noQCs <- PCA(imp_metabind_clust_log2_noQCs, # wide data
quali.sup=1:11, # remove qualitative variables
graph=FALSE, # don't graph
scale.unit=FALSE) # don't scale, we already did this
# look at summary
kable(summary(PCA.imp_metabind_clust_log2_noQCs))##
## Call:
## PCA(X = imp_metabind_clust_log2_noQCs, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 442.675 216.681 85.884 80.110 59.619 51.536 45.498
## % of var. 33.125 16.214 6.427 5.995 4.461 3.856 3.405
## Cumulative % of var. 33.125 49.339 55.766 61.760 66.221 70.078 73.482
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 33.998 29.970 27.966 25.672 19.956 19.069 17.245
## % of var. 2.544 2.243 2.093 1.921 1.493 1.427 1.290
## Cumulative % of var. 76.026 78.269 80.362 82.283 83.776 85.203 86.493
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 13.885 12.834 11.096 10.572 10.013 9.042 8.173
## % of var. 1.039 0.960 0.830 0.791 0.749 0.677 0.612
## Cumulative % of var. 87.532 88.493 89.323 90.114 90.863 91.540 92.152
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 7.924 7.496 6.923 6.391 6.054 5.626 5.191
## % of var. 0.593 0.561 0.518 0.478 0.453 0.421 0.388
## Cumulative % of var. 92.745 93.305 93.824 94.302 94.755 95.176 95.564
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.938 4.716 4.417 4.243 3.956 3.598 3.557
## % of var. 0.370 0.353 0.331 0.317 0.296 0.269 0.266
## Cumulative % of var. 95.934 96.287 96.617 96.935 97.231 97.500 97.766
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 3.342 3.044 2.985 2.834 2.790 2.645 2.519
## % of var. 0.250 0.228 0.223 0.212 0.209 0.198 0.188
## Cumulative % of var. 98.016 98.244 98.467 98.679 98.888 99.086 99.275
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 2.227 2.055 1.952 1.769 1.693
## % of var. 0.167 0.154 0.146 0.132 0.127
## Cumulative % of var. 99.441 99.595 99.741 99.873 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 24.474 | -8.948 0.377 0.134 | -7.285 0.510 0.089 |
## 2 | 21.958 | -8.466 0.337 0.149 | -6.522 0.409 0.088 |
## 3 | 29.207 | -2.723 0.035 0.009 | -1.649 0.026 0.003 |
## 4 | 24.365 | -9.702 0.443 0.159 | -5.657 0.308 0.054 |
## 5 | 25.228 | -6.929 0.226 0.075 | -4.957 0.236 0.039 |
## 6 | 70.267 | 66.284 20.677 0.890 | 2.314 0.051 0.001 |
## 7 | 28.559 | -7.378 0.256 0.067 | -4.057 0.158 0.020 |
## 8 | 23.446 | -4.051 0.077 0.030 | -3.374 0.109 0.021 |
## 9 | 21.859 | -5.050 0.120 0.053 | -5.721 0.315 0.068 |
## 10 | 28.598 | -4.366 0.090 0.023 | 0.476 0.002 0.000 |
## Dim.3 ctr cos2
## 1 -9.632 2.251 0.155 |
## 2 -8.522 1.762 0.151 |
## 3 -0.815 0.016 0.001 |
## 4 -7.801 1.476 0.102 |
## 5 -5.084 0.627 0.041 |
## 6 -8.482 1.745 0.015 |
## 7 -9.038 1.981 0.100 |
## 8 -3.846 0.359 0.027 |
## 9 4.862 0.573 0.049 |
## 10 -7.609 1.404 0.071 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 226.9516_0.553 | -0.161 0.006 0.087 | 0.002 0.000 0.000 | -0.041
## 159.1492_0.608 | -0.080 0.001 0.011 | -0.245 0.028 0.100 | -0.090
## 175.1442_0.616 | -0.122 0.003 0.018 | 0.420 0.081 0.209 | -0.022
## 189.1684_0.616 | -0.083 0.002 0.016 | -0.091 0.004 0.019 | 0.062
## 189.16_0.615 | -0.029 0.000 0.003 | -0.044 0.001 0.006 | -0.081
## 156.0769_0.621 | -0.038 0.000 0.002 | -0.146 0.010 0.024 | -0.177
## 170.0926_0.62 | -0.207 0.010 0.040 | -0.330 0.050 0.101 | -0.108
## 136.0482_0.633 | 0.001 0.000 0.000 | -0.066 0.002 0.009 | -0.030
## 137.071_0.636 | -0.061 0.001 0.012 | 0.133 0.008 0.056 | -0.011
## 162.1126_0.642 | 0.051 0.001 0.002 | -0.410 0.077 0.131 | -0.279
## ctr cos2
## 226.9516_0.553 0.002 0.006 |
## 159.1492_0.608 0.009 0.014 |
## 175.1442_0.616 0.001 0.001 |
## 189.1684_0.616 0.004 0.009 |
## 189.16_0.615 0.008 0.021 |
## 156.0769_0.621 0.036 0.036 |
## 170.0926_0.62 0.013 0.011 |
## 136.0482_0.633 0.001 0.002 |
## 137.071_0.636 0.000 0.000 |
## 162.1126_0.642 0.091 0.061 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2 v.test
## 6101_U1_C18POS_59 | 24.474 | -8.948 0.134 -0.425 | -7.285 0.089 -0.495 |
## 6101_U2_C18POS_30 | 31.098 | -7.765 0.062 -0.369 | -8.354 0.072 -0.568 |
## 6101_U3_C18POS_29_1 | 23.192 | -8.632 0.139 -0.410 | -7.117 0.094 -0.483 |
## 6101_U4_C18POS_14 | 24.794 | -9.236 0.139 -0.439 | -7.146 0.083 -0.485 |
## 6102_U1_C18POS_26_1 | 21.958 | -8.466 0.149 -0.402 | -6.522 0.088 -0.443 |
## 6102_U2_C18POS_16 | 23.503 | -7.536 0.103 -0.358 | -6.663 0.080 -0.453 |
## 6102_U3_C18POS_48 | 34.281 | -7.219 0.044 -0.343 | -3.010 0.008 -0.204 |
## 6102_U4_C18POS_50 | 23.118 | -7.066 0.093 -0.336 | -4.202 0.033 -0.285 |
## 6103_U1_C18POS_21 | 29.207 | -2.723 0.009 -0.129 | -1.649 0.003 -0.112 |
## 6103_U2_C18POS_60 | 31.461 | -7.029 0.050 -0.334 | -5.851 0.035 -0.397 |
## Dim.3 cos2 v.test
## 6101_U1_C18POS_59 -9.632 0.155 -1.039 |
## 6101_U2_C18POS_30 8.652 0.077 0.934 |
## 6101_U3_C18POS_29_1 -7.474 0.104 -0.807 |
## 6101_U4_C18POS_14 -8.577 0.120 -0.926 |
## 6102_U1_C18POS_26_1 -8.522 0.151 -0.920 |
## 6102_U2_C18POS_16 11.249 0.229 1.214 |
## 6102_U3_C18POS_48 -8.688 0.064 -0.938 |
## 6102_U4_C18POS_50 -7.756 0.113 -0.837 |
## 6103_U1_C18POS_21 -0.815 0.001 -0.088 |
## 6103_U2_C18POS_60 14.838 0.222 1.601 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_C18POS_59 | | | 24.474 | | | -8.948 | 0.134 | -0.425 | | | -7.285 | 0.089 | -0.495 | | | -9.632 | 0.155 | -1.039 | | |
| 6101_U2_C18POS_30 | | | 31.098 | | | -7.765 | 0.062 | -0.369 | | | -8.354 | 0.072 | -0.568 | | | 8.652 | 0.077 | 0.934 | | |
| 6101_U3_C18POS_29_1 | | | 23.192 | | | -8.632 | 0.139 | -0.410 | | | -7.117 | 0.094 | -0.483 | | | -7.474 | 0.104 | -0.807 | | |
| 6101_U4_C18POS_14 | | | 24.794 | | | -9.236 | 0.139 | -0.439 | | | -7.146 | 0.083 | -0.485 | | | -8.577 | 0.120 | -0.926 | | |
| 6102_U1_C18POS_26_1 | | | 21.958 | | | -8.466 | 0.149 | -0.402 | | | -6.522 | 0.088 | -0.443 | | | -8.522 | 0.151 | -0.920 | | |
| 6102_U2_C18POS_16 | | | 23.503 | | | -7.536 | 0.103 | -0.358 | | | -6.663 | 0.080 | -0.453 | | | 11.249 | 0.229 | 1.214 | | |
| 6102_U3_C18POS_48 | | | 34.281 | | | -7.219 | 0.044 | -0.343 | | | -3.010 | 0.008 | -0.204 | | | -8.688 | 0.064 | -0.938 | | |
| 6102_U4_C18POS_50 | | | 23.118 | | | -7.066 | 0.093 | -0.336 | | | -4.202 | 0.033 | -0.285 | | | -7.756 | 0.113 | -0.837 | | |
| 6103_U1_C18POS_21 | | | 29.207 | | | -2.723 | 0.009 | -0.129 | | | -1.649 | 0.003 | -0.112 | | | -0.815 | 0.001 | -0.088 | | |
| 6103_U2_C18POS_60 | | | 31.461 | | | -7.029 | 0.050 | -0.334 | | | -5.851 | 0.035 | -0.397 | | | 14.838 | 0.222 | 1.601 | | |
# pull PC coordinates into df
PC_coord_noQCs_log2 <- as.data.frame(PCA.imp_metabind_clust_log2_noQCs$ind$coord)
# bind back metadata from cols 1-10
PC_coord_noQCs_log2 <- bind_cols(imp_metabind_clust_log2_noQCs[,1:11], PC_coord_noQCs_log2)
# grab some variance explained
importance_noQC <- PCA.imp_metabind_clust_log2_noQCs$eig
# set variance explained with PC1, round to 2 digits
PC1_noQC <- round(importance_noQC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_noQC <- round(importance_noQC[2,2], 2)Using FactoExtra
# scree plot
fviz_eig(PCA.imp_metabind_clust_log2_noQCs)# scores plot
fviz_pca_ind(PCA.imp_metabind_clust_log2_noQCs)# plot of contributions from features to PC1
(var_contrib_noQCs_PC1 <- fviz_contrib(PCA.imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 1,
top = 25,
title = "Var contribution to PC1: no QCs"))# plot of contributions from features to PC2
(var_contrib_noQCs_PC2 <- fviz_contrib(PCA.imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 2,
top = 25,
title = "Var contribution to PC2: no QCs"))# loadings
fviz_pca_var(PCA.imp_metabind_clust_log2_noQCs) # nightmare
### Manual scores plots
(PCA_withoutQCs <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention,
text = Subject)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gold", "tomato1")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, without QCs"))ggplotly(PCA_withoutQCs, tooltip = "text")(PCA_withoutQCs.pre_post <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed, without QCs"))ggplotly(PCA_withoutQCs.pre_post,
tooltip = "text") # go back to wide data
imp_nooutliers_log2 <- imp_metabind_clust_tidy_log2 %>%
dplyr::select(!rel_abund) %>%
filter(Subject != 6106,
Subject != 6112) %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2)
PCA.imp_nooutliers_log2 <- PCA(imp_nooutliers_log2, # wide data
quali.sup = 1:11, # remove qualitative variables
graph = FALSE, # don't graph
scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
summary(PCA.imp_nooutliers_log2)##
## Call:
## PCA(X = imp_nooutliers_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 632.613 80.101 76.289 56.486 50.044 40.829 35.071
## % of var. 50.756 6.427 6.121 4.532 4.015 3.276 2.814
## Cumulative % of var. 50.756 57.183 63.304 67.836 71.851 75.127 77.941
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 27.663 26.301 24.814 18.722 17.912 13.030 12.512
## % of var. 2.219 2.110 1.991 1.502 1.437 1.045 1.004
## Cumulative % of var. 80.160 82.271 84.261 85.764 87.201 88.246 89.250
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 10.771 10.425 8.902 8.482 7.992 7.531 7.138
## % of var. 0.864 0.836 0.714 0.681 0.641 0.604 0.573
## Cumulative % of var. 90.114 90.951 91.665 92.345 92.987 93.591 94.164
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 6.445 5.903 5.481 5.253 5.135 4.778 4.641
## % of var. 0.517 0.474 0.440 0.421 0.412 0.383 0.372
## Cumulative % of var. 94.681 95.154 95.594 96.016 96.428 96.811 97.183
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.150 3.614 3.335 3.146 2.956 2.889 2.669
## % of var. 0.333 0.290 0.268 0.252 0.237 0.232 0.214
## Cumulative % of var. 97.516 97.806 98.074 98.326 98.563 98.795 99.009
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 2.338 2.293 2.073 2.028 1.755 0.501 0.287
## % of var. 0.188 0.184 0.166 0.163 0.141 0.040 0.023
## Cumulative % of var. 99.197 99.381 99.547 99.710 99.851 99.891 99.914
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 0.264 0.225 0.211 0.192 0.178
## % of var. 0.021 0.018 0.017 0.015 0.014
## Cumulative % of var. 99.935 99.953 99.970 99.986 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr
## 1 | 27.108 | -17.095 0.962 0.398 | 4.057 0.428
## 2 | 25.464 | -17.904 1.056 0.494 | 1.926 0.096
## 3 | 29.736 | -6.656 0.146 0.050 | -9.283 2.241
## 4 | 26.649 | -15.931 0.836 0.357 | -0.353 0.003
## 5 | 27.454 | -14.253 0.669 0.270 | -0.253 0.002
## 6 | 30.652 | -14.275 0.671 0.217 | 11.172 3.246
## 7 | 25.333 | -11.015 0.400 0.189 | -2.724 0.193
## 8 | 23.089 | -10.489 0.362 0.206 | -4.634 0.558
## 9 | 30.150 | -9.736 0.312 0.104 | -0.445 0.005
## 10 | 25.787 | -10.087 0.335 0.153 | -1.443 0.054
## cos2 Dim.3 ctr cos2
## 1 0.022 | 6.885 1.294 0.064 |
## 2 0.006 | 6.767 1.250 0.071 |
## 3 0.097 | 6.223 1.057 0.044 |
## 4 0.000 | 7.230 1.427 0.074 |
## 5 0.000 | 4.367 0.521 0.025 |
## 6 0.133 | 4.980 0.677 0.026 |
## 7 0.012 | 5.769 0.909 0.052 |
## 8 0.040 | -4.607 0.580 0.040 |
## 9 0.000 | 9.802 2.624 0.106 |
## 10 0.003 | 4.534 0.561 0.031 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 226.9516_0.553 | 0.019 0.000 0.002 | 0.001 0.000 0.000 | 0.021
## 159.1492_0.608 | -0.630 0.063 0.467 | 0.220 0.061 0.057 | -0.008
## 175.1442_0.616 | 0.101 0.002 0.017 | 0.273 0.093 0.128 | -0.101
## 189.1684_0.616 | -0.008 0.000 0.000 | 0.160 0.032 0.066 | -0.197
## 189.16_0.615 | -0.005 0.000 0.000 | 0.011 0.000 0.000 | -0.019
## 156.0769_0.621 | 0.017 0.000 0.000 | 0.030 0.001 0.001 | 0.066
## 170.0926_0.62 | 0.019 0.000 0.000 | 0.292 0.107 0.113 | -0.191
## 136.0482_0.633 | 0.017 0.000 0.001 | -0.022 0.001 0.001 | 0.065
## 137.071_0.636 | 0.070 0.001 0.019 | 0.078 0.008 0.024 | -0.077
## 162.1126_0.642 | 0.158 0.004 0.029 | 0.205 0.052 0.048 | 0.095
## ctr cos2
## 226.9516_0.553 0.001 0.002 |
## 159.1492_0.608 0.000 0.000 |
## 175.1442_0.616 0.013 0.018 |
## 189.1684_0.616 0.051 0.099 |
## 189.16_0.615 0.000 0.001 |
## 156.0769_0.621 0.006 0.006 |
## 170.0926_0.62 0.048 0.048 |
## 136.0482_0.633 0.005 0.009 |
## 137.071_0.636 0.008 0.023 |
## 162.1126_0.642 0.012 0.010 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2
## 6101_U1_C18POS_59 | 27.108 | -17.095 0.398 -0.680 | 4.057 0.022
## 6101_U2_C18POS_30 | 31.590 | -11.837 0.140 -0.471 | -0.865 0.001
## 6101_U3_C18POS_29_1 | 25.900 | -16.644 0.413 -0.662 | 2.547 0.010
## 6101_U4_C18POS_14 | 27.144 | -16.612 0.375 -0.660 | 0.439 0.000
## 6102_U1_C18POS_26_1 | 25.464 | -17.904 0.494 -0.712 | 1.926 0.006
## 6102_U2_C18POS_16 | 24.521 | -11.875 0.235 -0.472 | -2.611 0.011
## 6102_U3_C18POS_48 | 34.180 | -8.034 0.055 -0.319 | 1.947 0.003
## 6102_U4_C18POS_50 | 24.490 | -11.836 0.234 -0.471 | -0.707 0.001
## 6103_U1_C18POS_21 | 29.736 | -6.656 0.050 -0.265 | -9.283 0.097
## 6103_U2_C18POS_60 | 31.483 | -9.104 0.084 -0.362 | -12.120 0.148
## v.test Dim.3 cos2 v.test
## 6101_U1_C18POS_59 0.453 | 6.885 0.064 0.788 |
## 6101_U2_C18POS_30 -0.097 | -9.156 0.084 -1.048 |
## 6101_U3_C18POS_29_1 0.285 | 5.143 0.039 0.589 |
## 6101_U4_C18POS_14 0.049 | 7.796 0.082 0.893 |
## 6102_U1_C18POS_26_1 0.215 | 6.767 0.071 0.775 |
## 6102_U2_C18POS_16 -0.292 | -10.773 0.193 -1.233 |
## 6102_U3_C18POS_48 0.218 | 9.052 0.070 1.036 |
## 6102_U4_C18POS_50 -0.079 | 8.940 0.133 1.024 |
## 6103_U1_C18POS_21 -1.037 | 6.223 0.044 0.712 |
## 6103_U2_C18POS_60 -1.354 | -9.861 0.098 -1.129 |
# pull PC coordinates into df
PC_nooutliers_QC_log2 <- as.data.frame(PCA.imp_nooutliers_log2$ind$coord)
# bind back metadata from cols 1-11
PC_nooutliers_QC_log2 <- bind_cols(imp_nooutliers_log2[,1:11], PC_nooutliers_QC_log2)
# grab some variance explained
importance_nooutliers_QC <- PCA.imp_nooutliers_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_nooutliers_withQC <- round(importance_nooutliers_QC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_nooutliers_withQC <- round(importance_nooutliers_QC[2,2], 2)Using FactoExtra package
# scree plot
fviz_eig(PCA.imp_nooutliers_log2)# scores plot
fviz_pca_ind(PCA.imp_nooutliers_log2)# loadings
fviz_pca_var(PCA.imp_nooutliers_log2)##### Red vs yellow
# manual scores plot
(PCA_nooutliers_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_withQC/PC1_nooutliers_withQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot"))(PCA_nooutliers_prepost_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_withQC/PC1_nooutliers_withQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot"))ggplotly(PCA_nooutliers_prepost_withQCs,
tooltip = "text") imp_nooutliers_noQCs_log2 <- imp_nooutliers_log2 %>%
filter(Intervention != "QC")
PCA.imp_nooutliers_noQCs_log2 <- PCA(imp_nooutliers_noQCs_log2, # wide data
quali.sup=1:11, # remove qualitative variables
graph=FALSE, # don't graph
scale.unit=FALSE) # don't scale, we already did this
# look at summary
kable(summary(PCA.imp_nooutliers_noQCs_log2))##
## Call:
## PCA(X = imp_nooutliers_noQCs_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 96.330 93.614 68.239 61.589 51.600 44.525 33.683
## % of var. 12.847 12.485 9.101 8.214 6.882 5.938 4.492
## Cumulative % of var. 12.847 25.332 34.433 42.646 49.528 55.466 59.958
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 32.064 30.266 22.584 21.916 17.720 15.259 13.558
## % of var. 4.276 4.036 3.012 2.923 2.363 2.035 1.808
## Cumulative % of var. 64.234 68.271 71.283 74.206 76.569 78.604 80.412
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 12.561 10.855 10.175 9.598 9.193 8.802 7.867
## % of var. 1.675 1.448 1.357 1.280 1.226 1.174 1.049
## Cumulative % of var. 82.087 83.535 84.892 86.172 87.398 88.572 89.621
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 7.163 6.670 6.306 6.157 5.729 5.587 5.065
## % of var. 0.955 0.889 0.841 0.821 0.764 0.745 0.676
## Cumulative % of var. 90.576 91.466 92.307 93.128 93.892 94.637 95.313
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.389 3.995 3.826 3.565 3.520 3.207 2.828
## % of var. 0.585 0.533 0.510 0.475 0.469 0.428 0.377
## Cumulative % of var. 95.898 96.431 96.941 97.417 97.886 98.314 98.691
## Dim.36 Dim.37 Dim.38 Dim.39
## Variance 2.760 2.498 2.427 2.132
## % of var. 0.368 0.333 0.324 0.284
## Cumulative % of var. 99.059 99.392 99.716 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 22.060 | 1.908 0.094 0.007 | -9.337 2.328 0.179 |
## 2 | 19.487 | -0.100 0.000 0.000 | -8.659 2.002 0.197 |
## 3 | 29.273 | -9.635 2.409 0.108 | -2.772 0.205 0.009 |
## 4 | 22.045 | -2.243 0.131 0.010 | -8.247 1.816 0.140 |
## 5 | 23.751 | -1.470 0.056 0.004 | -5.165 0.712 0.047 |
## 6 | 27.362 | 9.670 2.427 0.125 | -7.819 1.633 0.082 |
## 7 | 22.793 | -3.614 0.339 0.025 | -4.761 0.605 0.044 |
## 8 | 20.588 | -3.837 0.382 0.035 | 5.035 0.677 0.060 |
## 9 | 28.597 | -2.131 0.118 0.006 | -9.160 2.241 0.103 |
## 10 | 23.814 | -2.336 0.142 0.010 | -4.325 0.500 0.033 |
## Dim.3 ctr cos2
## 1 -4.601 0.776 0.044 |
## 2 -4.313 0.682 0.049 |
## 3 1.037 0.039 0.001 |
## 4 -2.837 0.295 0.017 |
## 5 -1.873 0.128 0.006 |
## 6 2.074 0.158 0.006 |
## 7 3.102 0.353 0.019 |
## 8 -2.865 0.301 0.019 |
## 9 5.032 0.928 0.031 |
## 10 -5.018 0.922 0.044 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 226.9516_0.553 | -0.012 0.000 0.000 | -0.044 0.002 0.007 | -0.306
## 159.1492_0.608 | 0.223 0.052 0.094 | -0.071 0.005 0.010 | 0.222
## 175.1442_0.616 | 0.319 0.106 0.147 | 0.062 0.004 0.006 | 0.173
## 189.1684_0.616 | 0.209 0.045 0.094 | 0.169 0.031 0.062 | 0.096
## 189.16_0.615 | 0.006 0.000 0.000 | -0.007 0.000 0.000 | -0.055
## 156.0769_0.621 | 0.002 0.000 0.000 | -0.118 0.015 0.016 | 0.069
## 170.0926_0.62 | 0.335 0.116 0.124 | 0.098 0.010 0.011 | -0.192
## 136.0482_0.633 | -0.040 0.002 0.003 | -0.071 0.005 0.009 | -0.342
## 137.071_0.636 | 0.097 0.010 0.032 | 0.058 0.004 0.011 | -0.106
## 162.1126_0.642 | 0.204 0.043 0.041 | -0.137 0.020 0.018 | 0.121
## ctr cos2
## 226.9516_0.553 0.137 0.329 |
## 159.1492_0.608 0.073 0.094 |
## 175.1442_0.616 0.044 0.043 |
## 189.1684_0.616 0.014 0.020 |
## 189.16_0.615 0.004 0.011 |
## 156.0769_0.621 0.007 0.005 |
## 170.0926_0.62 0.054 0.041 |
## 136.0482_0.633 0.171 0.200 |
## 137.071_0.636 0.016 0.038 |
## 162.1126_0.642 0.022 0.014 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2 v.test
## 6101_U1_C18POS_59 | 22.060 | 1.908 0.007 0.194 | -9.337 0.179 -0.965 |
## 6101_U2_C18POS_30 | 29.326 | 0.452 0.000 0.046 | 8.149 0.077 0.842 |
## 6101_U3_C18POS_29_1 | 20.766 | 0.806 0.002 0.082 | -7.225 0.121 -0.747 |
## 6101_U4_C18POS_14 | 22.314 | -1.659 0.006 -0.169 | -9.165 0.169 -0.947 |
## 6102_U1_C18POS_26_1 | 19.487 | -0.100 0.000 -0.010 | -8.659 0.197 -0.895 |
## 6102_U2_C18POS_16 | 21.425 | -0.804 0.001 -0.082 | 10.445 0.238 1.080 |
## 6102_U3_C18POS_48 | 33.397 | 0.399 0.000 0.041 | -8.801 0.069 -0.910 |
## 6102_U4_C18POS_50 | 21.520 | -2.403 0.012 -0.245 | -8.653 0.162 -0.894 |
## 6103_U1_C18POS_21 | 29.273 | -9.635 0.108 -0.982 | -2.772 0.009 -0.286 |
## 6103_U2_C18POS_60 | 30.106 | -9.772 0.105 -0.996 | 12.656 0.177 1.308 |
## Dim.3 cos2 v.test
## 6101_U1_C18POS_59 -4.601 0.044 -0.557 |
## 6101_U2_C18POS_30 4.239 0.021 0.513 |
## 6101_U3_C18POS_29_1 -6.735 0.105 -0.815 |
## 6101_U4_C18POS_14 -1.658 0.006 -0.201 |
## 6102_U1_C18POS_26_1 -4.313 0.049 -0.522 |
## 6102_U2_C18POS_16 7.005 0.107 0.848 |
## 6102_U3_C18POS_48 2.598 0.006 0.314 |
## 6102_U4_C18POS_50 7.618 0.125 0.922 |
## 6103_U1_C18POS_21 1.037 0.001 0.126 |
## 6103_U2_C18POS_60 8.463 0.079 1.025 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_C18POS_59 | | | 22.060 | | | 1.908 | 0.007 | 0.194 | | | -9.337 | 0.179 | -0.965 | | | -4.601 | 0.044 | -0.557 | | |
| 6101_U2_C18POS_30 | | | 29.326 | | | 0.452 | 0.000 | 0.046 | | | 8.149 | 0.077 | 0.842 | | | 4.239 | 0.021 | 0.513 | | |
| 6101_U3_C18POS_29_1 | | | 20.766 | | | 0.806 | 0.002 | 0.082 | | | -7.225 | 0.121 | -0.747 | | | -6.735 | 0.105 | -0.815 | | |
| 6101_U4_C18POS_14 | | | 22.314 | | | -1.659 | 0.006 | -0.169 | | | -9.165 | 0.169 | -0.947 | | | -1.658 | 0.006 | -0.201 | | |
| 6102_U1_C18POS_26_1 | | | 19.487 | | | -0.100 | 0.000 | -0.010 | | | -8.659 | 0.197 | -0.895 | | | -4.313 | 0.049 | -0.522 | | |
| 6102_U2_C18POS_16 | | | 21.425 | | | -0.804 | 0.001 | -0.082 | | | 10.445 | 0.238 | 1.080 | | | 7.005 | 0.107 | 0.848 | | |
| 6102_U3_C18POS_48 | | | 33.397 | | | 0.399 | 0.000 | 0.041 | | | -8.801 | 0.069 | -0.910 | | | 2.598 | 0.006 | 0.314 | | |
| 6102_U4_C18POS_50 | | | 21.520 | | | -2.403 | 0.012 | -0.245 | | | -8.653 | 0.162 | -0.894 | | | 7.618 | 0.125 | 0.922 | | |
| 6103_U1_C18POS_21 | | | 29.273 | | | -9.635 | 0.108 | -0.982 | | | -2.772 | 0.009 | -0.286 | | | 1.037 | 0.001 | 0.126 | | |
| 6103_U2_C18POS_60 | | | 30.106 | | | -9.772 | 0.105 | -0.996 | | | 12.656 | 0.177 | 1.308 | | | 8.463 | 0.079 | 1.025 | | |
# pull PC coordinates into df
PC_coord_nooutliers_noQC_log2 <- as.data.frame(PCA.imp_nooutliers_noQCs_log2$ind$coord)
# bind back metadata from cols 1-11
PC_coord_nooutliers_noQC_log2 <- bind_cols(imp_nooutliers_noQCs_log2[,1:11], PC_coord_nooutliers_noQC_log2)
# grab some variance explained
importance_nooutliers_noQC <- PCA.imp_nooutliers_noQCs_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_nooutliers_noQC <- round(importance_nooutliers_noQC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_nooutliers_noQC <- round(importance_nooutliers_noQC[2,2], 2)Using FactoExtra
# scree plot
fviz_eig(PCA.imp_nooutliers_noQCs_log2)# scores plot
fviz_pca_ind(PCA.imp_nooutliers_noQCs_log2)# plot of contributions from features to PC1
(var_contrib_nooutliers_noQCs_PC1 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 1,
top = 20,
title = "Var contribution to PC1: no outliers, no QCs"))# plot of contributions from features to PC2
(var_contrib_nooutliers_noQCs_PC2 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 2,
top = 20,
title = "Var contribution to PC2: no outliers, no QCs"))# loadings
fviz_pca_var(PCA.imp_nooutliers_noQCs_log2) # nightmare
#### Manual scores plots
(PCA_nooutliers_withoutQCs <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("tomato1", "gold")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot"))ggplotly(PCA_nooutliers_withoutQCs)(PCA_nooutliers_noQCs.prepost <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no outliers"))ggplotly(PCA_nooutliers_noQCs.prepost,
tooltip = "text") (PCA_nooutliers_noQCs.MvsF <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(Sex, levels = c("M","F")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("green", "pink")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no 6106"))ggplotly(PCA_nooutliers_noQCs.MvsF,
tooltip = "text") if (!requireNamespace('BiocManager', quietly = TRUE))
install.packages('BiocManager')
BiocManager::install('PCAtools')library(PCAtools)# create rel abund df suitable for PCAtools package
imp_clust_omicsdata_outliers_forPCAtools <- as.data.frame(t(imp_clust)) # transpose df
names(imp_clust_omicsdata_outliers_forPCAtools) <- imp_clust_omicsdata_outliers_forPCAtools[1,] # make sample IDs column names
imp_clust_omicsdata_outliers_forPCAtools <- imp_clust_omicsdata_outliers_forPCAtools[-1,] # remove sample ID row
# create metadata df suitable for PCAtools pckg
metadata_outliers_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
order_outliers_forPCAtools <- match(rownames(metadata_outliers_forPCAtools), colnames(imp_clust_omicsdata_outliers_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_outliers_reordered_forPCAtools <- imp_clust_omicsdata_outliers_forPCAtools[ ,order_outliers_forPCAtools]
# change abundance df to numeric
abundata_outliers_reordered_forPCAtools <- abundata_outliers_reordered_forPCAtools %>%
mutate_all(as.numeric)
# Log transform
log2_abundata_outliers_forPCAtools <- log2(abundata_outliers_reordered_forPCAtools)
# unite pre_post column with intervention column to create pre_intervention column
metadata_outliers_forPCAtools <- metadata_outliers_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)# pca
p_outliers <- PCAtools::pca(log2_abundata_outliers_forPCAtools,
metadata = metadata_outliers_forPCAtools,
scale = FALSE # using scaled data already (log2 transformed)
)
# plot
PCAtools::biplot(p_outliers,
showLoadings = TRUE, # show variables that contribute the most to PCs
lab = NULL,
title = ) biplot(p_outliers,
lab = paste0(metadata_outliers_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
# ellipse config
ellipse = TRUE,
ellipseType = 't',
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 1.0,
xlim = c(-100,150), ylim = c(-80, 80),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot with 95% Confidence Interval",
subtitle = "Log2 transformed data, C18 (+), with outliers, no QCs")(PCA.colby.prevspost_outliers <- biplot(p_outliers,
lab = NULL,
# or lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot with Loadings",
subtitle = "Log2 transformed data, C18 (+), without QCs but with outliers",
showLoadings = TRUE))# create rel abund df suitable for PCAtools package
imp_clust_omicsdata_forPCAtools <- as.data.frame(t(imp_clust)) # transpose df
names(imp_clust_omicsdata_forPCAtools) <- imp_clust_omicsdata_forPCAtools[1,] # make sample IDs column names
imp_clust_omicsdata_forPCAtools <- imp_clust_omicsdata_forPCAtools[-1,] # remove sample ID row
imp_clust_omicsdata_forPCAtools <- imp_clust_omicsdata_forPCAtools %>%
dplyr::select(!contains("QC")) # remove QC observations
# create metadata df suitable for PCAtools pckg
metadata_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
order_forPCAtools <- match(rownames(metadata_forPCAtools), colnames(imp_clust_omicsdata_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_reordered_forPCAtools <- imp_clust_omicsdata_forPCAtools[ ,order_forPCAtools]
# change abundance df to numeric
abundata_reordered_forPCAtools <- abundata_reordered_forPCAtools %>%
mutate_all(as.numeric)
# Log transform
log2_abundata_forPCAtools <- log2(abundata_reordered_forPCAtools)
# remove outlier subj from both df
log2_abundata_forPCAtools <- log2_abundata_forPCAtools %>%
dplyr::select(!contains("6106")) %>%
dplyr::select(!contains("6112"))
metadata_forPCAtools <- metadata_forPCAtools %>%
filter(Subject != 6106,
Subject != 6112)
# unite pre_post column with intervention column to create pre_intervention column
metadata_forPCAtools <- metadata_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)# pca
p <- PCAtools::pca(log2_abundata_forPCAtools,
metadata = metadata_forPCAtools,
scale = FALSE # using scaled data already (log2 transformed)
)
# plot
PCAtools::biplot(p,
showLoadings = TRUE, # show variables that contribute the most to PCs
lab = NULL,
title = )Horn’s parallel analysis
horn <- parallelPCA(log2_abundata_forPCAtools)
horn$n## [1] 9
Elbow method
elbow <- findElbowPoint(p$variance)
elbow## PC5
## 5
screeplot(p,
components = getComponents(p, 1:20),
vline = c(horn$n, elbow)) +
geom_label(aes(x = horn$n + 1, y = 50,
label = 'Horn\'s', vjust = -1, size = 8)) +
geom_label(aes(x = elbow + 1, y = 50,
label = 'Elbow method', vjust = -3, size = 8))How many PCs do we need to capture at least 80% variance?
which(cumsum(p$variance) > 80)[1]## PC14
## 14
biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
# ellipse config
ellipse = TRUE,
ellipseType = 't', # assumes multivariate t-distribution
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 0,
xlim = c(-50,50), ylim = c(-30,25),
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, C18 (+), outliers removed, no QCs \n95% confidence level ellipses")(PCA.colby.prevspost <- biplot(p,
lab = NULL,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, C18 (+), outliers removed, no QCs \n95% confidence level ellipses",
showLoadings = TRUE))(PCA_pairsplot.colby.prevspost <-
pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "pink",
"post_Red" = "red4"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm')))(PCA.colby.Sex <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject))))ggplotly(PCA.colby.Sex,
tooltip = "text") pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.overall.prevspost <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post',
colkey = c("pre" = "orange",
"post" = "green3"),
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject))))ggplotly(PCA.colby.overall.prevspost,
tooltip = "text") pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post',
colkey = c("pre" = "orange",
"post" = "green3"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.period <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Period',
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject))))ggplotly(PCA.colby.period,
tooltip = "text") pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Period',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.sequence <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'sequence',
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject))))ggplotly(PCA.colby.sequence,
tooltip = "text") pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'sequence',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))This is a cool way to explore the correlations between the metadata and the PCs! I want to look at how the metavariables correlate with PCs that account for 80% variation in the dataset.
Again: How many PCs do we need to capture at least 80% variance?
which(cumsum(p$variance) > 80)[1]## PC14
## 14
eigencorplot(p,
components = getComponents(p, 1:14), # get components that account for 80% variance
metavars = colnames(metadata_forPCAtools),
col = c('darkblue', 'blue2', 'gray', 'red2', 'darkred'),
cexCorval = 0.7,
colCorval = 'white',
fontCorval = 2,
posLab = 'bottomleft',
rotLabX = 45,
posColKey = 'top',
cexLabColKey = 1.5,
scale = TRUE,
main = 'PC1-14 metadata correlations',
colFrame = 'white',
plotRsquared = FALSE) eigencorplot(p,
components = getComponents(p, 1:14),
metavars = colnames(metadata_forPCAtools),
col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
cexCorval = 1.2,
fontCorval = 2,
posLab = 'all',
rotLabX = 45,
scale = TRUE,
main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ metadata ~ correlates),
plotRsquared = TRUE,
corFUN = 'pearson',
corUSE = 'pairwise.complete.obs',
corMultipleTestCorrection = 'BH',
signifSymbols = c('****', '***', '**', '*', ''),
signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))I am most interested in PCs affected by pre_post_intervention, so I think it would be good to investigate the metabolites that contribute the most to these PCs.
library(mixOmics)Data_forMPCA <- imp_metabind_clust_log2_noQCs %>%
mutate_at("Subject", as.factor)
summary(as.factor(Data_forMPCA$Subject))## 6101 6102 6103 6104 6105 6106 6108 6109 6110 6111 6112 6113
## 4 4 4 4 4 4 4 4 4 4 4 4
# make a vector for meta variables
(metavar <- Data_forMPCA[,c(1:11)] %>%
colnames())## [1] "sample_ID" "Subject" "Period"
## [4] "pre_post_intervention" "Intervention" "pre_post"
## [7] "sequence" "Intervention_week" "Sex"
## [10] "Age" "BMI"
mixOmicsPCA.result <- mixOmics::pca(Data_forMPCA[,!names(Data_forMPCA) %in% metavar],
scale = FALSE,
center = FALSE)
plotIndiv(mixOmicsPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post_intervention,
legend = TRUE,
legend.title = "Treatment",
title = 'Regular PCA, C18 (+), Log2 transformed')With all data
multilevelPCA.result <- mixOmics::pca(Data_forMPCA[,-(c(1:11))],
multilevel = Data_forMPCA$Subject,
scale = FALSE,
center = FALSE)
plotIndiv(multilevelPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post_intervention,
legend = TRUE,
legend.title = "Treatment",
title = 'Multilevel PCA, C18 (+), Log2 transformed')plotLoadings(multilevelPCA.result, ndisplay = 75)# use tidy data
head(imp_metabind_clust_tidy_log2)## # A tibble: 6 × 14
## sample_ID Subject Period pre_post_intervention Intervention pre_post sequence
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 2 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 3 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 4 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 5 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 6 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## # ℹ 7 more variables: Intervention_week <chr>, Sex <chr>, Age <chr>, BMI <chr>,
## # mz_rt <chr>, rel_abund <dbl>, rel_abund_log2 <dbl>
# remove QCs
df_for_stats <- imp_metabind_clust_tidy_log2 %>%
filter(Intervention != "QC")
# check if QCs were removed
unique(df_for_stats$Intervention)## [1] "Red" "Yellow"
# df without outliers
df_for_stats_noOutlier <- df_for_stats %>%
filter(Subject != "6106",
Subject != "6112")
# check if outlier was removed
unique(df_for_stats_noOutlier$Subject)## [1] "6101" "6102" "6103" "6104" "6105" "6108" "6109" "6110" "6111" "6113"
# turn off sci notation outputs
options(scipen = 999)Here, I am comparing pre- to post-intervention for both yellow and tomato soy (Red) juice interventions. I also compare post-yellow to post-red intervention. I am using the log transformed values of rel abundance since parametric tests assume normality.
# run paired t-tests for control intervention
ctrl_t.test_paired <- df_for_stats %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
ctrl_t.test_paired_sig <- ctrl_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_t.test_paired_sig)## # A tibble: 157 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 100.0756_0.… rel_… post pre 12 12 -2.37 11 3.72e-2 *
## 2 111.044_2.9… rel_… post pre 12 12 -2.47 11 3.09e-2 *
## 3 114.0669_0.… rel_… post pre 12 12 -2.46 11 3.19e-2 *
## 4 132.0768_0.… rel_… post pre 12 12 -3.61 11 4.08e-3 **
## 5 136.0482_0.… rel_… post pre 12 12 3.04 11 1.13e-2 *
## 6 141.0659_0.… rel_… post pre 12 12 -2.45 11 3.21e-2 *
## 7 142.0862_0.… rel_… post pre 12 12 -3.04 11 1.13e-2 *
## 8 149.0598_7.… rel_… post pre 12 12 2.76 11 1.87e-2 *
## 9 156.0769_0.… rel_… post pre 12 12 -4.64 11 7.15e-4 ***
## 10 159.1492_0.… rel_… post pre 12 12 -2.52 11 2.87e-2 *
## # ℹ 147 more rows
# how many are significant?
nrow(ctrl_t.test_paired_sig)## [1] 157
# run paired t-tests for control intervention
ctrl_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
ctrl_noOutlier_t.test_paired_sig <- ctrl_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_noOutlier_t.test_paired_sig)## # A tibble: 112 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 111.044_2.9… rel_… post pre 10 10 -2.30 9 0.0468 *
## 2 132.0768_0.… rel_… post pre 10 10 -3.14 9 0.012 *
## 3 136.0482_0.… rel_… post pre 10 10 2.32 9 0.0456 *
## 4 142.0862_0.… rel_… post pre 10 10 -2.56 9 0.0308 *
## 5 144.1283_0.… rel_… post pre 10 10 2.33 9 0.0451 *
## 6 149.0598_7.… rel_… post pre 10 10 2.27 9 0.0497 *
## 7 156.0769_0.… rel_… post pre 10 10 -4.08 9 0.00276 **
## 8 162.1126_0.… rel_… post pre 10 10 -3.26 9 0.0099 **
## 9 163.1243_0.… rel_… post pre 10 10 -4.09 9 0.00273 **
## 10 166.0862_0.… rel_… post pre 10 10 -2.38 9 0.0413 *
## # ℹ 102 more rows
# how many are significant?
nrow(ctrl_noOutlier_t.test_paired_sig)## [1] 112
# run paired t-tests for control intervention
red_t.test_paired <- df_for_stats %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
red_t.test_paired_sig <- red_t.test_paired %>%
filter(p <= 0.05)
tibble(red_t.test_paired_sig)## # A tibble: 83 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 111.044_2.9… rel_… post pre 12 12 -2.37 11 0.037 *
## 2 121.0284_3.… rel_… post pre 12 12 3.19 11 0.00866 **
## 3 135.0441_3.… rel_… post pre 12 12 -3.13 11 0.00955 **
## 4 144.1023_0.… rel_… post pre 12 12 -3.19 11 0.00855 **
## 5 145.0648_5.… rel_… post pre 12 12 -3.31 11 0.00694 **
## 6 160.0967_0.… rel_… post pre 12 12 -3.82 11 0.00284 **
## 7 166.0863_2.… rel_… post pre 12 12 -2.81 11 0.0168 *
## 8 170.0447_0.… rel_… post pre 12 12 2.72 11 0.0198 *
## 9 170.0448_2.… rel_… post pre 12 12 2.85 11 0.0157 *
## 10 181.0609_3.… rel_… post pre 12 12 -2.60 11 0.0249 *
## # ℹ 73 more rows
# how many are significant?
nrow(red_t.test_paired_sig)## [1] 83
# run paired t-tests for control intervention
red_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
red_noOutlier_t.test_paired_sig <- red_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(red_noOutlier_t.test_paired_sig)## # A tibble: 74 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 121.0284_3.… rel_… post pre 10 10 3.49 9 0.00678 **
## 2 121.0646_4.… rel_… post pre 10 10 2.96 9 0.016 *
## 3 135.0441_3.… rel_… post pre 10 10 -2.78 9 0.0213 *
## 4 138.0551_0.… rel_… post pre 10 10 2.77 9 0.0216 *
## 5 144.1023_0.… rel_… post pre 10 10 -2.92 9 0.0171 *
## 6 145.0648_5.… rel_… post pre 10 10 -2.68 9 0.0253 *
## 7 160.0967_0.… rel_… post pre 10 10 -3.75 9 0.00455 **
## 8 166.0863_2.… rel_… post pre 10 10 -2.77 9 0.0218 *
## 9 190.0499_3.… rel_… post pre 10 10 -3.91 9 0.00358 **
## 10 196.0604_3.… rel_… post pre 10 10 3.26 9 0.00992 **
## # ℹ 64 more rows
# how many are significant?
nrow(red_noOutlier_t.test_paired_sig)## [1] 74
# run paired t-tests for post interventions
post_t.test_paired <- df_for_stats %>%
filter(pre_post == "post") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
post_t.test_paired_sig <- post_t.test_paired %>%
filter(p <= 0.05)
tibble(post_t.test_paired_sig)## # A tibble: 30 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 195.0647_0.… rel_… Red Yellow 12 12 -2.23 11 4.79e-2 *
## 2 203.139_0.6… rel_… Red Yellow 12 12 2.33 11 3.98e-2 *
## 3 220.1367_4.… rel_… Red Yellow 12 12 2.27 11 4.41e-2 *
## 4 234.1122_4.… rel_… Red Yellow 12 12 -6.46 11 4.69e-5 ****
## 5 250.1107_3.… rel_… Red Yellow 12 12 -4.21 11 1.46e-3 **
## 6 255.0655_5.… rel_… Red Yellow 12 12 10.6 11 4.26e-7 ****
## 7 271.0596_4.… rel_… Red Yellow 12 12 12.9 11 5.47e-8 ****
## 8 271.1656_4.… rel_… Red Yellow 12 12 2.33 11 3.97e-2 *
## 9 274.1833_5.… rel_… Red Yellow 12 12 2.34 11 3.92e-2 *
## 10 277.1432_7.… rel_… Red Yellow 12 12 3.90 11 2.48e-3 **
## # ℹ 20 more rows
# how many are significant?
nrow(post_t.test_paired_sig)## [1] 30
# run paired t-tests for post interventions
post_noOutlier_t.test_paired <- df_for_stats %>%
filter(pre_post == "post",
Subject != "6106") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
post_noOutlier_t.test_paired_sig <- post_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(post_noOutlier_t.test_paired_sig)## # A tibble: 32 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 142.0862_0.… rel_… Red Yellow 11 11 2.58 10 2.73e-2 *
## 2 159.1492_0.… rel_… Red Yellow 11 11 2.97 10 1.41e-2 *
## 3 160.0967_0.… rel_… Red Yellow 11 11 -2.24 10 4.88e-2 *
## 4 219.1705_0.… rel_… Red Yellow 11 11 -2.36 10 4.01e-2 *
## 5 220.1367_4.… rel_… Red Yellow 11 11 2.27 10 4.66e-2 *
## 6 230.9902_0.… rel_… Red Yellow 11 11 -2.27 10 4.65e-2 *
## 7 234.1122_4.… rel_… Red Yellow 11 11 -4.80 10 7.22e-4 ***
## 8 250.1107_3.… rel_… Red Yellow 11 11 -2.88 10 1.64e-2 *
## 9 255.0655_5.… rel_… Red Yellow 11 11 9.64 10 2.23e-6 ****
## 10 265.1278_3.… rel_… Red Yellow 11 11 2.46 10 3.39e-2 *
## # ℹ 22 more rows
# how many are significant?
nrow(post_noOutlier_t.test_paired_sig)## [1] 32
Are there any significant features shared between tests with and without outlier?
post_sig_outlier_comp <- list(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig) %>%
reduce(inner_join, by = "mz_rt")
tibble(post_sig_outlier_comp)## # A tibble: 22 × 19
## mz_rt .y..x group1.x group2.x n1.x n2.x statistic.x df.x p.x
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 220.1367_4.453 rel_a… Red Yellow 11 11 2.27 10 4.66e-2
## 2 234.1122_4.883 rel_a… Red Yellow 11 11 -4.80 10 7.22e-4
## 3 250.1107_3.546 rel_a… Red Yellow 11 11 -2.88 10 1.64e-2
## 4 255.0655_5.039 rel_a… Red Yellow 11 11 9.64 10 2.23e-6
## 5 271.0596_4.449 rel_a… Red Yellow 11 11 11.6 10 4.17e-7
## 6 271.1656_4.57 rel_a… Red Yellow 11 11 2.25 10 4.84e-2
## 7 277.1432_7.079 rel_a… Red Yellow 11 11 3.25 10 8.72e-3
## 8 302.1959_0.798 rel_a… Red Yellow 11 11 2.96 10 1.42e-2
## 9 316.1385_4.504 rel_a… Red Yellow 11 11 11.8 10 3.45e-7
## 10 385.1464_0.793 rel_a… Red Yellow 11 11 -2.42 10 3.64e-2
## # ℹ 12 more rows
## # ℹ 10 more variables: p.signif.x <chr>, .y..y <chr>, group1.y <chr>,
## # group2.y <chr>, n1.y <int>, n2.y <int>, statistic.y <dbl>, df.y <dbl>,
## # p.y <dbl>, p.signif.y <chr>
# how many sig features are shared between df vs df w/o outliers
nrow(post_sig_outlier_comp)## [1] 22
# return sig features present only in df with outlier, and not in df without outlier
tibble(anti_join(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig,
by = "mz_rt"))## # A tibble: 10 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 142.0862_0.… rel_… Red Yellow 11 11 2.58 10 0.0273 *
## 2 159.1492_0.… rel_… Red Yellow 11 11 2.97 10 0.0141 *
## 3 160.0967_0.… rel_… Red Yellow 11 11 -2.24 10 0.0488 *
## 4 219.1705_0.… rel_… Red Yellow 11 11 -2.36 10 0.0401 *
## 5 230.9902_0.… rel_… Red Yellow 11 11 -2.27 10 0.0465 *
## 6 265.1278_3.… rel_… Red Yellow 11 11 2.46 10 0.0339 *
## 7 303.0862_4.… rel_… Red Yellow 11 11 -2.37 10 0.0392 *
## 8 330.2275_4.… rel_… Red Yellow 11 11 3.43 10 0.00641 **
## 9 357.1179_3.… rel_… Red Yellow 11 11 -3.24 10 0.00893 **
## 10 369.1289_3.… rel_… Red Yellow 11 11 2.23 10 0.0495 *
# return sig features from df without outlier that are also present in df with outlier
kable(semi_join(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig,
by = "mz_rt"))| mz_rt | .y. | group1 | group2 | n1 | n2 | statistic | df | p | p.signif |
|---|---|---|---|---|---|---|---|---|---|
| 220.1367_4.453 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.269912 | 10 | 0.0466000 | * |
| 234.1122_4.883 | rel_abund_log2 | Red | Yellow | 11 | 11 | -4.801695 | 10 | 0.0007220 | *** |
| 250.1107_3.546 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.878823 | 10 | 0.0164000 | * |
| 255.0655_5.039 | rel_abund_log2 | Red | Yellow | 11 | 11 | 9.635113 | 10 | 0.0000022 | **** |
| 271.0596_4.449 | rel_abund_log2 | Red | Yellow | 11 | 11 | 11.552075 | 10 | 0.0000004 | **** |
| 271.1656_4.57 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.247091 | 10 | 0.0484000 | * |
| 277.1432_7.079 | rel_abund_log2 | Red | Yellow | 11 | 11 | 3.249989 | 10 | 0.0087200 | ** |
| 302.1959_0.798 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.964711 | 10 | 0.0142000 | * |
| 316.1385_4.504 | rel_abund_log2 | Red | Yellow | 11 | 11 | 11.789444 | 10 | 0.0000003 | **** |
| 385.1464_0.793 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.415252 | 10 | 0.0364000 | * |
| 429.2112_4.041 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.634179 | 10 | 0.0250000 | * |
| 431.0964_3.273 | rel_abund_log2 | Red | Yellow | 11 | 11 | 12.651878 | 10 | 0.0000002 | **** |
| 431.0966_3.757 | rel_abund_log2 | Red | Yellow | 11 | 11 | 14.104093 | 10 | 0.0000001 | **** |
| 431.0972_4.035 | rel_abund_log2 | Red | Yellow | 11 | 11 | 17.150081 | 10 | 0.0000000 | **** |
| 433.1121_3.884 | rel_abund_log2 | Red | Yellow | 11 | 11 | 10.996072 | 10 | 0.0000007 | **** |
| 433.1123_3.32 | rel_abund_log2 | Red | Yellow | 11 | 11 | 12.348794 | 10 | 0.0000002 | **** |
| 435.1282_4.891 | rel_abund_log2 | Red | Yellow | 11 | 11 | 17.807699 | 10 | 0.0000000 | **** |
| 447.0916_4.229 | rel_abund_log2 | Red | Yellow | 11 | 11 | 11.780509 | 10 | 0.0000003 | **** |
| 449.1046_3.633 | rel_abund_log2 | Red | Yellow | 11 | 11 | 4.053527 | 10 | 0.0023100 | ** |
| 449.1059_4.17 | rel_abund_log2 | Red | Yellow | 11 | 11 | 3.200105 | 10 | 0.0094900 | ** |
| 461.1074_3.814 | rel_abund_log2 | Red | Yellow | 11 | 11 | 18.846858 | 10 | 0.0000000 | **** |
| 461.1078_3.304 | rel_abund_log2 | Red | Yellow | 11 | 11 | 18.880964 | 10 | 0.0000000 | **** |
# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_v_yellow_volcano_data <- df_for_stats %>%
filter(pre_post == "post") %>%
group_by(Intervention, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = Intervention, values_from = mean_rel_abund) %>%
mutate(FC_postRed_div_postYellow = Red/Yellow)
# bind back pval
red_v_yellow_tobind <- post_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_v_yellow_volcano_data <-
bind_cols(red_v_yellow_volcano_data, red_v_yellow_tobind) %>%
mutate(log2_FC_postRed_div_postYellow = if_else(FC_postRed_div_postYellow > 0,
log2(FC_postRed_div_postYellow),
-(log2(abs(FC_postRed_div_postYellow)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. I will choose 8. At this point I don't have any absent features so the data shouldn't change.
red_v_yellow_volcano_data <- red_v_yellow_volcano_data %>%
mutate(log2_FC_postRed_div_postYellow = if_else(is.infinite(log2_FC_postRed_div_postYellow),
8, log2_FC_postRed_div_postYellow))
# create a df of features passing FC and pval cutoffs higher in post-Red
higher_postRed <- red_v_yellow_volcano_data %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow >= 1)
# create a df of features passing FC and pval cutoffs higher in post-control
higher_postYellow <- red_v_yellow_volcano_data %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow <= -1) write_csv(higher_postRed,
"intervention-comp-sig-RED-C18Pos-05Jun23.csv")
write_csv(higher_postYellow,
"intervention-comp-sig-YELLOW-C18Pos-05Jun23.csv")(red_v_yellow_volcano <- red_v_yellow_volcano_data %>%
ggplot(aes(x = log2_FC_postRed_div_postYellow, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change tomato/control: {round(FC_postRed_div_postYellow, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = higher_postRed,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "tomato") +
geom_point(data = higher_postYellow,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy and Control Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption while yellow points are higher \nafter control tomato juice consumption",
caption = "Vertical dashed lines represent a log fold change >1 or < -1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (TomatoSoy/Control)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_v_yellow_volcano_ggplotly <- ggplotly(red_v_yellow_volcano, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_v_yellow_volcano,
filename = "red_v_yellow_volcano.svg")
# save interactive volcano plot
saveWidget(widget = red_v_yellow_volcano_ggplotly,
file = "interactive_redvyellow_volcano_plot.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_v_yel_volcano_data_noOutlier <- df_for_stats %>%
filter(pre_post == "post",
Subject != 6106,
Subject != 6112) %>% # remove outlier subj
group_by(Intervention, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = Intervention, values_from = mean_rel_abund) %>%
mutate(FC_postRed_div_postYellow = Red/Yellow)
# bind back pval
red_v_yel_tobind_noOutlier <- post_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_v_yel_volcano_data_noOutlier <-
bind_cols(red_v_yel_volcano_data_noOutlier, red_v_yel_tobind_noOutlier) %>%
mutate(log2_FC_postRed_div_postYellow = if_else(FC_postRed_div_postYellow > 0,
log2(FC_postRed_div_postYellow),
-(log2(abs(FC_postRed_div_postYellow)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. I will choose 8. At this point I don't have any absent features so the data shouldn't change.
red_v_yel_volcano_data_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
mutate(log2_FC_postRed_div_postYellow = if_else(is.infinite(log2_FC_postRed_div_postYellow),
8, log2_FC_postRed_div_postYellow))
# create a df of features passing FC and pval cutoffs higher in post-Red
higher_postRed_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow >= 1)
# create a df of features passing FC and pval cutoffs higher in post-control
higher_postYellow_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow <= -1) write_csv(higher_postRed_noOutlier,
"intervention-comp-sig-RED-nooutliers-C18Pos-05Jun23.csv")
write_csv(higher_postYellow_noOutlier,
"intervention-comp-sig-YELLOW-nooutliers-C18Pos-05Jun23.csv")(red_v_yellow_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_postRed_div_postYellow, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change tomato/control: {round(FC_postRed_div_postYellow, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = higher_postRed_noOutlier,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "tomato") +
geom_point(data = higher_postYellow_noOutlier,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy and Control Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption while yellow points are higher \nafter control tomato juice consumption. Subject 6106 removed",
caption = "Vertical dashed lines represent a log fold change >1 or < -1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (TomatoSoy/Control)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_v_yellow_volcano_ggplotly_noOutlier <- ggplotly(red_v_yellow_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_v_yellow_volcano_noOutlier,
filename = "red_v_yellow_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = red_v_yellow_volcano_ggplotly_noOutlier,
file = "interactive_redvyellow_volcano_plot_noOutlier.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_volcano_data <- df_for_stats %>%
filter(Intervention == "Red") %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
red_tobind <- red_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_volcano_data <-
bind_cols(red_volcano_data, red_tobind) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#red_volcano_data <- red_volcano_data %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
red_higher_post <- red_volcano_data %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)Save file of features that pass FC and pvalue cutoffs
write_csv(red_higher_post,
"pre-post-sig-RED-C18Pos-05Jun23.csv")(red_volcano <- red_volcano_data %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = red_higher_post,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "tomato") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption when compared to prior to consumption",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_volcano_ggplotly <- ggplotly(red_volcano, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_volcano,
filename = "red_volcano.svg")
# save interactive volcano plot
saveWidget(widget = red_volcano_ggplotly,
file = "interactive_red_volcano_plot.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Red",
Subject != 6106,
Subject != 6112) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
red_tobind_noOutlier <- red_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_volcano_data_noOutlier <-
bind_cols(red_volcano_data_noOutlier, red_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#red_volcano_data_noOutlier <- red_volcano_data_noOutlier %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
red_higher_post_noOutlier <- red_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)write_csv(red_higher_post_noOutlier,
"pre-post-sig-RED-nooutliers-C18Pos-05Jun23.csv")(red_volcano_noOutlier <- red_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = red_higher_post_noOutlier,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "tomato") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption when compared to prior to consumption",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_volcano_ggplotly_noOutlier <- ggplotly(red_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_volcano_noOutlier,
filename = "red_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = red_volcano_ggplotly_noOutlier,
file = "interactive_red_volcano_plot_noOutlier.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
yel_volcano_data <- df_for_stats %>%
filter(Intervention == "Yellow") %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
yel_tobind <- ctrl_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
yel_volcano_data <-
bind_cols(yel_volcano_data, yel_tobind) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#yel_volcano_data <- yel_volcano_data %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
yellow_higher_post <- yel_volcano_data %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)write_csv(yellow_higher_post,
"pre-post-sig-YELLOW-C18Pos-05Jun23.csv")(yellow_volcano <- yel_volcano_data %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = yellow_higher_post,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Control, Yellow Tomato Juice Consumption",
subtitle = "Yellow points are higher after control juice consumption when compared to prior to consumption",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(yellow_volcanoo_ggplotly <- ggplotly(yellow_volcano, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = yellow_volcano,
filename = "yellow_volcano.svg")
# save interactive volcano plot
saveWidget(widget = red_volcano_ggplotly,
file = "interactive_red_volcano_plot.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
yel_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Yellow",
Subject != 6106,
Subject != 6106) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
yel_tobind_noOutlier <- ctrl_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
yel_volcano_data_noOutlier <-
bind_cols(yel_volcano_data_noOutlier, yel_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#yel_volcano_data_noOutlier <- yel_volcano_data_noOutlier %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
yel_higher_post_noOutlier <- yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)write_csv(yel_higher_post_noOutlier,
"pre-post-sig-YELLOW-nooutliers-C18Pos-05Jun23.csv")(yel_volcano_noOutlier <- yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = yel_higher_post_noOutlier,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Control, Yellow Tomato Juice Consumption",
subtitle = "Yellow points are higher after control juice consumption when compared to prior to consumption.\nSubject 6106 removed",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(yel_volcano_ggplotly_noOutlier <- ggplotly(yel_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = yel_volcano_noOutlier,
filename = "yel_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = yel_volcano_ggplotly_noOutlier,
file = "interactive_yel_volcano_plot_noOutlier.html")